From 84c61d92bb6e9048eecc0738a83f1bf66f053026 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 1 Aug 2014 11:13:30 +0300 Subject: Bluetooth: Add convenience function to check for pending power off There are several situations where we're interested in knowing whether we're currently in the process of powering off an adapter. This patch adds a convenience function for the purpose and makes it public since we'll soon need to access it from hci_event.c as well. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b5d5af3aa469..8394abc4fd87 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1351,6 +1351,7 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, s8 rssi, u8 *name, u8 name_len); void mgmt_discovering(struct hci_dev *hdev, u8 discovering); +bool mgmt_powering_down(struct hci_dev *hdev); void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent); void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk); void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, -- cgit v1.2.3 From 432df05eb1e57adfc46df08abbedca6c3b8862f7 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 1 Aug 2014 11:13:31 +0300 Subject: Bluetooth: Create unified helper function for updating page scan Similar to our hci_update_background_scan() function we can simplify a lot of code by creating a unified helper function for doing page scan updates. This patch adds such a function to hci_core.c and updates all the relevant places to use it. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 5 +++ net/bluetooth/hci_core.c | 31 +++++++++++++++++ net/bluetooth/mgmt.c | 72 +++++++--------------------------------- 3 files changed, 48 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 8394abc4fd87..cc2eb7730fbc 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -968,6 +968,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_host_le_capable(dev) (!!((dev)->features[1][0] & LMP_HOST_LE)) #define lmp_host_le_br_capable(dev) (!!((dev)->features[1][0] & LMP_HOST_LE_BREDR)) +#define hdev_is_powered(hdev) (test_bit(HCI_UP, &hdev->flags) && \ + !test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 @@ -1256,6 +1259,8 @@ bool hci_req_pending(struct hci_dev *hdev); void hci_req_add_le_scan_disable(struct hci_request *req); void hci_req_add_le_passive_scan(struct hci_request *req); +void hci_update_page_scan(struct hci_dev *hdev, struct hci_request *req); + struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout); struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index c32d361c0cf7..a031589598b2 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -5680,3 +5680,34 @@ void hci_update_background_scan(struct hci_dev *hdev) if (err) BT_ERR("Failed to run HCI request: err %d", err); } + +void hci_update_page_scan(struct hci_dev *hdev, struct hci_request *req) +{ + u8 scan; + + if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) + return; + + if (!hdev_is_powered(hdev)) + return; + + if (mgmt_powering_down(hdev)) + return; + + if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags) || + !list_empty(&hdev->whitelist)) + scan = SCAN_PAGE; + else + scan = SCAN_DISABLED; + + if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE)) + return; + + if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) + scan |= SCAN_INQUIRY; + + if (req) + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + else + hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); +} diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c9de0d9945f5..c2457435a670 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -129,9 +129,6 @@ static const u16 mgmt_events[] = { #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) -#define hdev_is_powered(hdev) (test_bit(HCI_UP, &hdev->flags) && \ - !test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) - struct pending_cmd { struct list_head list; u16 opcode; @@ -1536,9 +1533,11 @@ static void set_discoverable_complete(struct hci_dev *hdev, u8 status) /* When the discoverable mode gets changed, make sure * that class of device has the limited discoverable - * bit correctly set. + * bit correctly set. Also update page scan based on whitelist + * entries. */ hci_req_init(&req, hdev); + hci_update_page_scan(hdev, &req); update_class(&req); hci_req_run(&req, NULL); @@ -1785,6 +1784,7 @@ static void set_connectable_complete(struct hci_dev *hdev, u8 status) if (conn_changed || discov_changed) { new_settings(hdev, cmd->sk); + hci_update_page_scan(hdev, NULL); if (discov_changed) mgmt_update_adv_data(hdev); hci_update_background_scan(hdev); @@ -1818,6 +1818,7 @@ static int set_connectable_update_settings(struct hci_dev *hdev, return err; if (changed) { + hci_update_page_scan(hdev, NULL); hci_update_background_scan(hdev); return new_settings(hdev, sk); } @@ -4381,27 +4382,6 @@ unlock: return err; } -static void set_bredr_scan(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - u8 scan = 0; - - /* Ensure that fast connectable is disabled. This function will - * not do anything if the page scan parameters are already what - * they should be. - */ - write_fast_connectable(req, false); - - if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags) || - !list_empty(&hdev->whitelist)) - scan |= SCAN_PAGE; - if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) - scan |= SCAN_INQUIRY; - - if (scan) - hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); -} - static void set_bredr_complete(struct hci_dev *hdev, u8 status) { struct pending_cmd *cmd; @@ -4507,9 +4487,8 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) hci_req_init(&req, hdev); - if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags) || - !list_empty(&hdev->whitelist)) - set_bredr_scan(&req); + write_fast_connectable(&req, false); + hci_update_page_scan(hdev, &req); /* Since only the advertising data flags will change, there * is no need to update the scan response data. @@ -5235,27 +5214,6 @@ unlock: return err; } -/* Helper for Add/Remove Device commands */ -static void update_page_scan(struct hci_dev *hdev, u8 scan) -{ - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) - return; - - if (!hdev_is_powered(hdev)) - return; - - /* If HCI_CONNECTABLE is set then Add/Remove Device should not - * make any changes to page scanning. - */ - if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags)) - return; - - if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) - scan |= SCAN_INQUIRY; - - hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); -} - static void device_added(struct sock *sk, struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type, u8 action) { @@ -5291,8 +5249,6 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); if (cp->addr.type == BDADDR_BREDR) { - bool update_scan; - /* Only incoming connections action is supported for now */ if (cp->action != 0x01) { err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, @@ -5301,15 +5257,12 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - update_scan = list_empty(&hdev->whitelist); - err = hci_bdaddr_list_add(&hdev->whitelist, &cp->addr.bdaddr, cp->addr.type); if (err) goto unlock; - if (update_scan) - update_page_scan(hdev, SCAN_PAGE); + hci_update_page_scan(hdev, NULL); goto added; } @@ -5392,8 +5345,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (list_empty(&hdev->whitelist)) - update_page_scan(hdev, SCAN_DISABLED); + hci_update_page_scan(hdev, NULL); device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type); @@ -5444,7 +5396,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, kfree(b); } - update_page_scan(hdev, SCAN_DISABLED); + hci_update_page_scan(hdev, NULL); list_for_each_entry_safe(p, tmp, &hdev->le_conn_params, list) { if (p->auto_connect == HCI_AUTO_CONN_DISABLED) @@ -5969,8 +5921,8 @@ static int powered_update_hci(struct hci_dev *hdev) sizeof(link_sec), &link_sec); if (lmp_bredr_capable(hdev)) { - if (test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) - set_bredr_scan(&req); + write_fast_connectable(&req, false); + hci_update_page_scan(hdev, &req); update_class(&req); update_name(&req); update_eir(&req); -- cgit v1.2.3 From d52deb17489b8155e031fb1a9f116c602d719e11 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 22:56:44 +0300 Subject: Bluetooth: Resume BT_CONNECTED state after LE security elevation The LE ATT socket uses a special trick where it temporarily sets BT_CONFIG state for the duration of a security level elevation. In order to not require special hacks for going back to BT_CONNECTED state in the l2cap_core.c code the most reasonable place to resume the state is the resume callback. This patch adds a new flag to track the pending security level change and ensures that the state is set back to BT_CONNECTED in the resume callback in case the flag is set. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 + net/bluetooth/l2cap_sock.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 8df15ad0d43f..4a51e7596608 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -708,6 +708,7 @@ enum { FLAG_EFS_ENABLE, FLAG_DEFER_SETUP, FLAG_LE_CONN_REQ_SENT, + FLAG_PENDING_SECURITY, }; enum { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 1884f72083c2..5a42f6a818c0 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -790,6 +790,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, if (chan->scid == L2CAP_CID_ATT) { if (smp_conn_security(conn->hcon, sec.level)) break; + set_bit(FLAG_PENDING_SECURITY, &chan->flags); sk->sk_state = BT_CONFIG; chan->state = BT_CONFIG; @@ -1359,6 +1360,11 @@ static void l2cap_sock_resume_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; + if (test_and_clear_bit(FLAG_PENDING_SECURITY, &chan->flags)) { + sk->sk_state = BT_CONNECTED; + chan->state = BT_CONNECTED; + } + clear_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags); sk->sk_state_change(sk); } -- cgit v1.2.3 From f193844c51e88ea3d2137bb0c1d38d27d37691a2 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:37:15 +0300 Subject: Bluetooth: Add more L2CAP convenience callbacks In preparation for converting SMP to use l2cap_chan it's useful to add a few more callback helpers so that smp.c won't need to define all of its own. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 4a51e7596608..a72965f6bc2f 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -838,18 +838,43 @@ static inline struct l2cap_chan *l2cap_chan_no_new_connection(struct l2cap_chan return NULL; } +static inline int l2cap_chan_no_recv(struct l2cap_chan *chan, struct sk_buff *skb) +{ + return -ENOSYS; +} + +static inline struct sk_buff *l2cap_chan_no_alloc_skb(struct l2cap_chan *chan, + unsigned long hdr_len, + unsigned long len, int nb) +{ + return ERR_PTR(-ENOSYS); +} + static inline void l2cap_chan_no_teardown(struct l2cap_chan *chan, int err) { } +static inline void l2cap_chan_no_close(struct l2cap_chan *chan) +{ +} + static inline void l2cap_chan_no_ready(struct l2cap_chan *chan) { } +static inline void l2cap_chan_no_state_change(struct l2cap_chan *chan, + int state, int err) +{ +} + static inline void l2cap_chan_no_defer(struct l2cap_chan *chan) { } +static inline void l2cap_chan_no_suspend(struct l2cap_chan *chan) +{ +} + static inline void l2cap_chan_no_resume(struct l2cap_chan *chan) { } -- cgit v1.2.3 From 70db83c4bcdc1447bbcb318389561c90d7056b18 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:37:16 +0300 Subject: Bluetooth: Add SMP L2CAP channel skeleton This patch creates the initial SMP L2CAP channels and a skeleton for their callbacks. There is one per-adapter channel created upon adapter registration, and then one channel per-connection created through the new_connection callback. The channels are registered with the reserved CID 0x1f for now in order to not conflict with existing SMP functionality. Once everything is in place the value can be changed to what it should be, i.e. L2CAP_CID_SMP. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/l2cap.h | 1 + net/bluetooth/smp.c | 131 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 132 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index cc2eb7730fbc..2571fc1cb1c5 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -303,6 +303,7 @@ struct hci_dev { __u32 req_result; struct crypto_blkcipher *tfm_aes; + void *smp_data; struct discovery_state discovery; struct hci_conn_hash conn_hash; diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index a72965f6bc2f..1a037ba4b6f4 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -637,6 +637,7 @@ struct l2cap_conn { struct delayed_work security_timer; struct smp_chan *smp_chan; + struct l2cap_chan *smp; struct list_head chan_l; struct mutex chan_lock; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ab07649ecc77..2362ae35a4d5 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1456,8 +1456,106 @@ int smp_distribute_keys(struct l2cap_conn *conn) return 0; } +static void smp_teardown_cb(struct l2cap_chan *chan, int err) +{ + struct l2cap_conn *conn = chan->conn; + + BT_DBG("chan %p", chan); + + conn->smp = NULL; + l2cap_chan_put(chan); +} + +static void smp_ready_cb(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + + BT_DBG("chan %p", chan); + + conn->smp = chan; + l2cap_chan_hold(chan); +} + +static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan, + unsigned long hdr_len, + unsigned long len, int nb) +{ + struct sk_buff *skb; + + skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); + + skb->priority = HCI_PRIO_MAX; + bt_cb(skb)->chan = chan; + + return skb; +} + +static const struct l2cap_ops smp_chan_ops = { + .name = "Security Manager", + .ready = smp_ready_cb, + .alloc_skb = smp_alloc_skb_cb, + .teardown = smp_teardown_cb, + + .new_connection = l2cap_chan_no_new_connection, + .recv = l2cap_chan_no_recv, + .state_change = l2cap_chan_no_state_change, + .close = l2cap_chan_no_close, + .defer = l2cap_chan_no_defer, + .suspend = l2cap_chan_no_suspend, + .resume = l2cap_chan_no_resume, + .set_shutdown = l2cap_chan_no_set_shutdown, + .get_sndtimeo = l2cap_chan_no_get_sndtimeo, + .memcpy_fromiovec = l2cap_chan_no_memcpy_fromiovec, +}; + +static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan) +{ + struct l2cap_chan *chan; + + BT_DBG("pchan %p", pchan); + + chan = l2cap_chan_create(); + if (!chan) + return NULL; + + chan->chan_type = pchan->chan_type; + chan->ops = &smp_chan_ops; + chan->scid = pchan->scid; + chan->dcid = chan->scid; + chan->imtu = pchan->imtu; + chan->omtu = pchan->omtu; + chan->mode = pchan->mode; + + BT_DBG("created chan %p", chan); + + return chan; +} + +static const struct l2cap_ops smp_root_chan_ops = { + .name = "Security Manager Root", + .new_connection = smp_new_conn_cb, + + /* None of these are implemented for the root channel */ + .close = l2cap_chan_no_close, + .alloc_skb = l2cap_chan_no_alloc_skb, + .recv = l2cap_chan_no_recv, + .state_change = l2cap_chan_no_state_change, + .teardown = l2cap_chan_no_teardown, + .ready = l2cap_chan_no_ready, + .defer = l2cap_chan_no_defer, + .suspend = l2cap_chan_no_suspend, + .resume = l2cap_chan_no_resume, + .set_shutdown = l2cap_chan_no_set_shutdown, + .get_sndtimeo = l2cap_chan_no_get_sndtimeo, + .memcpy_fromiovec = l2cap_chan_no_memcpy_fromiovec, +}; + int smp_register(struct hci_dev *hdev) { + struct l2cap_chan *chan; + BT_DBG("%s", hdev->name); hdev->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, @@ -1469,15 +1567,46 @@ int smp_register(struct hci_dev *hdev) return err; } + chan = l2cap_chan_create(); + if (!chan) { + crypto_free_blkcipher(hdev->tfm_aes); + hdev->tfm_aes = NULL; + return -ENOMEM; + } + + /* FIXME: Using reserved 0x1f value for now - to be changed to + * L2CAP_CID_SMP once all functionality is in place. + */ + l2cap_add_scid(chan, 0x1f); + + l2cap_chan_set_defaults(chan); + + bacpy(&chan->src, &hdev->bdaddr); + chan->src_type = BDADDR_LE_PUBLIC; + chan->state = BT_LISTEN; + chan->mode = L2CAP_MODE_BASIC; + chan->imtu = L2CAP_DEFAULT_MTU; + chan->ops = &smp_root_chan_ops; + + hdev->smp_data = chan; + return 0; } void smp_unregister(struct hci_dev *hdev) { - BT_DBG("%s", hdev->name); + struct l2cap_chan *chan = hdev->smp_data; + + if (!chan) + return; + + BT_DBG("%s chan %p", hdev->name, chan); if (hdev->tfm_aes) { crypto_free_blkcipher(hdev->tfm_aes); hdev->tfm_aes = NULL; } + + hdev->smp_data = NULL; + l2cap_chan_put(chan); } -- cgit v1.2.3 From defce9e83666658d4420d65e45ab1ad190992f72 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:37:17 +0300 Subject: Bluetooth: Make AES crypto context private to SMP Now that we have per-adapter SMP data thanks to the root SMP L2CAP channel we can take advantage of it and attach the AES crypto context (only used for SMP) to it. This means that the smp_irk_matches() and smp_generate_rpa() function can be converted to internally handle the AES context. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 - net/bluetooth/hci_core.c | 13 ++----------- net/bluetooth/smp.c | 41 +++++++++++++++++++++++++++------------- net/bluetooth/smp.h | 5 ++--- 4 files changed, 32 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2571fc1cb1c5..5f0b77b71b45 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -302,7 +302,6 @@ struct hci_dev { __u32 req_status; __u32 req_result; - struct crypto_blkcipher *tfm_aes; void *smp_data; struct discovery_state discovery; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 88575a633601..abeb5e47311e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3234,11 +3234,8 @@ struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) return irk; } - if (!hdev->tfm_aes) - return NULL; - list_for_each_entry(irk, &hdev->identity_resolving_keys, list) { - if (smp_irk_matches(hdev->tfm_aes, irk->val, rpa)) { + if (smp_irk_matches(hdev, irk->val, rpa)) { bacpy(&irk->rpa, rpa); return irk; } @@ -3887,13 +3884,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, !bacmp(&hdev->random_addr, &hdev->rpa)) return 0; - if (!hdev->tfm_aes) { - BT_ERR("%s crypto not available to generate RPA", - hdev->name); - return -EOPNOTSUPP; - } - - err = smp_generate_rpa(hdev->tfm_aes, hdev->irk, &hdev->rpa); + err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { BT_ERR("%s failed to generate new RPA", hdev->name); return err; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 2362ae35a4d5..6925fc4caaee 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -139,12 +139,18 @@ static int smp_ah(struct crypto_blkcipher *tfm, u8 irk[16], u8 r[3], u8 res[3]) return 0; } -bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], - bdaddr_t *bdaddr) +bool smp_irk_matches(struct hci_dev *hdev, u8 irk[16], bdaddr_t *bdaddr) { + struct l2cap_chan *chan = hdev->smp_data; + struct crypto_blkcipher *tfm; u8 hash[3]; int err; + if (!chan || !chan->data) + return false; + + tfm = chan->data; + BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(tfm, irk, &bdaddr->b[3], hash); @@ -154,10 +160,17 @@ bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], return !memcmp(bdaddr->b, hash, 3); } -int smp_generate_rpa(struct crypto_blkcipher *tfm, u8 irk[16], bdaddr_t *rpa) +int smp_generate_rpa(struct hci_dev *hdev, u8 irk[16], bdaddr_t *rpa) { + struct l2cap_chan *chan = hdev->smp_data; + struct crypto_blkcipher *tfm; int err; + if (!chan || !chan->data) + return -EOPNOTSUPP; + + tfm = chan->data; + get_random_bytes(&rpa->b[3], 3); rpa->b[5] &= 0x3f; /* Clear two most significant bits */ @@ -1555,25 +1568,25 @@ static const struct l2cap_ops smp_root_chan_ops = { int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; + struct crypto_blkcipher *tfm_aes; BT_DBG("%s", hdev->name); - hdev->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(hdev->tfm_aes)) { - int err = PTR_ERR(hdev->tfm_aes); + tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm_aes)) { + int err = PTR_ERR(tfm_aes); BT_ERR("Unable to create crypto context"); - hdev->tfm_aes = NULL; return err; } chan = l2cap_chan_create(); if (!chan) { - crypto_free_blkcipher(hdev->tfm_aes); - hdev->tfm_aes = NULL; + crypto_free_blkcipher(tfm_aes); return -ENOMEM; } + chan->data = tfm_aes; + /* FIXME: Using reserved 0x1f value for now - to be changed to * L2CAP_CID_SMP once all functionality is in place. */ @@ -1596,15 +1609,17 @@ int smp_register(struct hci_dev *hdev) void smp_unregister(struct hci_dev *hdev) { struct l2cap_chan *chan = hdev->smp_data; + struct crypto_blkcipher *tfm_aes; if (!chan) return; BT_DBG("%s chan %p", hdev->name, chan); - if (hdev->tfm_aes) { - crypto_free_blkcipher(hdev->tfm_aes); - hdev->tfm_aes = NULL; + tfm_aes = chan->data; + if (tfm_aes) { + chan->data = NULL; + crypto_free_blkcipher(tfm_aes); } hdev->smp_data = NULL; diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 6e29359f60a3..d3f6bd617940 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -132,9 +132,8 @@ int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey); void smp_chan_destroy(struct l2cap_conn *conn); -bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], - bdaddr_t *bdaddr); -int smp_generate_rpa(struct crypto_blkcipher *tfm, u8 irk[16], bdaddr_t *rpa); +bool smp_irk_matches(struct hci_dev *hdev, u8 irk[16], bdaddr_t *bdaddr); +int smp_generate_rpa(struct hci_dev *hdev, u8 irk[16], bdaddr_t *rpa); int smp_register(struct hci_dev *hdev); void smp_unregister(struct hci_dev *hdev); -- cgit v1.2.3 From 5d88cc73dded31a93fcc4821f33a8c3d755bf454 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:37:18 +0300 Subject: Bluetooth: Convert SMP to use l2cap_chan infrastructure Now that we have all the necessary pieces in place we can fully convert SMP to use the L2CAP channel infrastructure. This patch adds the necessary callbacks and removes the now unneeded conn->smp_chan pointer. One notable behavioral change in this patch comes from the following code snippet: - case L2CAP_CID_SMP: - if (smp_sig_channel(conn, skb)) - l2cap_conn_del(conn->hcon, EACCES); This piece of code was essentially forcing a disconnection if garbage SMP data was received. The l2cap_conn_del() function is private to l2cap_conn.c so we don't have access to it anymore when using the L2CAP channel callbacks. Therefore, the behavior of the new code is simply to return errors in the recv() callback (which is simply the old smp_sig_channel()), but no disconnection will occur. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 - net/bluetooth/l2cap_core.c | 10 ---- net/bluetooth/smp.c | 122 +++++++++++++++++++++++------------------- net/bluetooth/smp.h | 1 - 4 files changed, 66 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 1a037ba4b6f4..bda6252e3722 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -636,7 +636,6 @@ struct l2cap_conn { __u8 disc_reason; struct delayed_work security_timer; - struct smp_chan *smp_chan; struct l2cap_chan *smp; struct list_head chan_l; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 8acfe6f57b5e..40b6c06ab2c0 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1651,11 +1651,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) cancel_delayed_work_sync(&conn->info_timer); - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) { - cancel_delayed_work_sync(&conn->security_timer); - smp_chan_destroy(conn); - } - hcon->l2cap_data = NULL; conn->hchan = NULL; l2cap_conn_put(conn); @@ -6862,11 +6857,6 @@ static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) l2cap_le_sig_channel(conn, skb); break; - case L2CAP_CID_SMP: - if (smp_sig_channel(conn, skb)) - l2cap_conn_del(conn->hcon, EACCES); - break; - default: l2cap_data_channel(conn, cid, skb); break; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 6925fc4caaee..744f678ac3e8 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -248,44 +248,29 @@ static int smp_s1(struct smp_chan *smp, u8 k[16], u8 r1[16], u8 r2[16], return err; } -static struct sk_buff *smp_build_cmd(struct l2cap_conn *conn, u8 code, - u16 dlen, void *data) +static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) { - struct sk_buff *skb; - struct l2cap_hdr *lh; - int len; - - len = L2CAP_HDR_SIZE + sizeof(code) + dlen; - - if (len > conn->mtu) - return NULL; + struct l2cap_chan *chan = conn->smp; + struct kvec iv[2]; + struct msghdr msg; - skb = bt_skb_alloc(len, GFP_ATOMIC); - if (!skb) - return NULL; + if (!chan) + return; - lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); - lh->len = cpu_to_le16(sizeof(code) + dlen); - lh->cid = cpu_to_le16(L2CAP_CID_SMP); + BT_DBG("code 0x%2.2x", code); - memcpy(skb_put(skb, sizeof(code)), &code, sizeof(code)); + iv[0].iov_base = &code; + iv[0].iov_len = 1; - memcpy(skb_put(skb, dlen), data, dlen); + iv[1].iov_base = data; + iv[1].iov_len = len; - return skb; -} + memset(&msg, 0, sizeof(msg)); -static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) -{ - struct sk_buff *skb = smp_build_cmd(conn, code, len, data); + msg.msg_iov = (struct iovec *) &iv; + msg.msg_iovlen = 2; - BT_DBG("code 0x%2.2x", code); - - if (!skb) - return; - - skb->priority = HCI_PRIO_MAX; - hci_send_acl(conn->hchan, skb, 0); + l2cap_chan_send(chan, &msg, 1 + len); cancel_delayed_work_sync(&conn->security_timer); schedule_delayed_work(&conn->security_timer, SMP_TIMEOUT); @@ -315,7 +300,8 @@ static void build_pairing_cmd(struct l2cap_conn *conn, struct smp_cmd_pairing *req, struct smp_cmd_pairing *rsp, __u8 authreq) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; u8 local_dist = 0, remote_dist = 0; @@ -358,7 +344,8 @@ static void build_pairing_cmd(struct l2cap_conn *conn, static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; if ((max_key_size > SMP_MAX_ENC_KEY_SIZE) || (max_key_size < SMP_MIN_ENC_KEY_SIZE)) @@ -418,7 +405,8 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, u8 local_io, u8 remote_io) { struct hci_conn *hcon = conn->hcon; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; u8 method; u32 passkey = 0; int ret = 0; @@ -589,6 +577,7 @@ static u8 smp_random(struct smp_chan *smp) static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) { + struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; smp = kzalloc(sizeof(*smp), GFP_ATOMIC); @@ -606,7 +595,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) } smp->conn = conn; - conn->smp_chan = smp; + chan->data = smp; hci_conn_hold(conn->hcon); @@ -615,7 +604,8 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) void smp_chan_destroy(struct l2cap_conn *conn) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; bool complete; BUG_ON(!smp); @@ -646,14 +636,15 @@ void smp_chan_destroy(struct l2cap_conn *conn) } } + chan->data = NULL; kfree(smp); - conn->smp_chan = NULL; hci_conn_drop(conn->hcon); } int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) { struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_chan *chan; struct smp_chan *smp; u32 value; @@ -662,7 +653,11 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) if (!conn || !test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) return -ENOTCONN; - smp = conn->smp_chan; + chan = conn->smp; + if (!chan) + return -ENOTCONN; + + smp = chan->data; switch (mgmt_op) { case MGMT_OP_USER_PASSKEY_REPLY: @@ -709,10 +704,12 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (conn->hcon->role != HCI_ROLE_SLAVE) return SMP_CMD_NOTSUPP; - if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) + if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { smp = smp_chan_create(conn); - else - smp = conn->smp_chan; + } else { + struct l2cap_chan *chan = conn->smp; + smp = chan->data; + } if (!smp) return SMP_UNSPECIFIED; @@ -766,7 +763,8 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing *req, *rsp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; u8 key_size, auth = SMP_AUTH_NONE; int ret; @@ -827,7 +825,8 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave"); @@ -850,7 +849,8 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG("conn %p", conn); @@ -1023,7 +1023,8 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_encrypt_info *rp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG("conn %p", conn); @@ -1044,7 +1045,8 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_master_ident *rp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; struct hci_conn *hcon = conn->hcon; struct smp_ltk *ltk; @@ -1080,7 +1082,8 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_info *info = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG(""); @@ -1102,7 +1105,8 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_addr_info *info = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; bdaddr_t rpa; @@ -1156,7 +1160,8 @@ distribute: static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_sign_info *rp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; struct smp_csrk *csrk; @@ -1187,8 +1192,9 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } -int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) +static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { + struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; __u8 code, reason; int err = 0; @@ -1290,7 +1296,8 @@ done: static void smp_notify_keys(struct l2cap_conn *conn) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_cmd_pairing *req = (void *) &smp->preq[1]; @@ -1357,7 +1364,8 @@ static void smp_notify_keys(struct l2cap_conn *conn) int smp_distribute_keys(struct l2cap_conn *conn) { struct smp_cmd_pairing *req, *rsp; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; __u8 *keydist; @@ -1475,6 +1483,11 @@ static void smp_teardown_cb(struct l2cap_chan *chan, int err) BT_DBG("chan %p", chan); + if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { + cancel_delayed_work_sync(&conn->security_timer); + smp_chan_destroy(conn); + } + conn->smp = NULL; l2cap_chan_put(chan); } @@ -1508,11 +1521,11 @@ static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan, static const struct l2cap_ops smp_chan_ops = { .name = "Security Manager", .ready = smp_ready_cb, + .recv = smp_recv_cb, .alloc_skb = smp_alloc_skb_cb, .teardown = smp_teardown_cb, .new_connection = l2cap_chan_no_new_connection, - .recv = l2cap_chan_no_recv, .state_change = l2cap_chan_no_state_change, .close = l2cap_chan_no_close, .defer = l2cap_chan_no_defer, @@ -1587,10 +1600,7 @@ int smp_register(struct hci_dev *hdev) chan->data = tfm_aes; - /* FIXME: Using reserved 0x1f value for now - to be changed to - * L2CAP_CID_SMP once all functionality is in place. - */ - l2cap_add_scid(chan, 0x1f); + l2cap_add_scid(chan, L2CAP_CID_SMP); l2cap_chan_set_defaults(chan); diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index d3f6bd617940..161ace3c3234 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -126,7 +126,6 @@ enum { /* SMP Commands */ bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); -int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb); int smp_distribute_keys(struct l2cap_conn *conn); int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey); -- cgit v1.2.3 From dec5b49235e2526d7aacf5b93ea48f5e30c2f7c3 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 11 Aug 2014 22:06:37 +0300 Subject: Bluetooth: Add public l2cap_conn_shutdown() API to request disconnection Since we no-longer do special handling of SMP within l2cap_core.c we don't have any code for calling l2cap_conn_del() when smp.c doesn't like the data it gets. At the same time we cannot simply export l2cap_conn_del() since it will try to lock the channels it calls into whereas we already hold the lock in the smp.c l2cap_chan callbacks (i.e. it'd lead to a deadlock). This patch adds a new l2cap_conn_shutdown() API which is very similar to l2cap_conn_del() except that it defers the call to l2cap_conn_del() through a workqueue, thereby making it safe to use it from an L2CAP channel callback. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 4 ++++ net/bluetooth/l2cap_core.c | 25 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index bda6252e3722..40f34866b6da 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -625,6 +625,9 @@ struct l2cap_conn { struct delayed_work info_timer; + int disconn_err; + struct work_struct disconn_work; + struct sk_buff *rx_skb; __u32 rx_len; __u8 tx_ident; @@ -944,6 +947,7 @@ void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan, u8 status); void __l2cap_physical_cfm(struct l2cap_chan *chan, int result); +void l2cap_conn_shutdown(struct l2cap_conn *conn, int err); void l2cap_conn_get(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 4de1e1827055..404998efa7e2 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1627,6 +1627,9 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (work_pending(&conn->pending_rx_work)) cancel_work_sync(&conn->pending_rx_work); + if (work_pending(&conn->disconn_work)) + cancel_work_sync(&conn->disconn_work); + l2cap_unregister_all_users(conn); mutex_lock(&conn->chan_lock); @@ -1669,6 +1672,26 @@ static void security_timeout(struct work_struct *work) } } +static void disconn_work(struct work_struct *work) +{ + struct l2cap_conn *conn = container_of(work, struct l2cap_conn, + disconn_work); + + BT_DBG("conn %p", conn); + + l2cap_conn_del(conn->hcon, conn->disconn_err); +} + +void l2cap_conn_shutdown(struct l2cap_conn *conn, int err) +{ + struct hci_dev *hdev = conn->hcon->hdev; + + BT_DBG("conn %p err %d", conn, err); + + conn->disconn_err = err; + queue_work(hdev->workqueue, &conn->disconn_work); +} + static void l2cap_conn_free(struct kref *ref) { struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref); @@ -6930,6 +6953,8 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) else INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); + INIT_WORK(&conn->disconn_work, disconn_work); + skb_queue_head_init(&conn->pending_rx); INIT_WORK(&conn->pending_rx_work, process_pending_rx); -- cgit v1.2.3 From 276d807317dead63ef2f13aa46e3c17d57ba0713 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 11 Aug 2014 22:06:41 +0300 Subject: Bluetooth: Remove unused l2cap_conn->security_timer Now that there are no-longer any users for l2cap_conn->security_timer we can go ahead and simply remove it. The patch makes initialization of the conn->info_timer unconditional since it's better not to leave any l2cap_conn data structures uninitialized no matter what the underlying transport. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 - net/bluetooth/l2cap_core.c | 18 +----------------- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 40f34866b6da..cedda399f9c0 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -638,7 +638,6 @@ struct l2cap_conn { __u8 disc_reason; - struct delayed_work security_timer; struct l2cap_chan *smp; struct list_head chan_l; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 404998efa7e2..0cd7ed99558b 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1659,19 +1659,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_conn_put(conn); } -static void security_timeout(struct work_struct *work) -{ - struct l2cap_conn *conn = container_of(work, struct l2cap_conn, - security_timer.work); - - BT_DBG("conn %p", conn); - - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { - smp_chan_destroy(conn); - l2cap_conn_del(conn->hcon, ETIMEDOUT); - } -} - static void disconn_work(struct work_struct *work) { struct l2cap_conn *conn = container_of(work, struct l2cap_conn, @@ -6948,10 +6935,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) INIT_LIST_HEAD(&conn->chan_l); INIT_LIST_HEAD(&conn->users); - if (hcon->type == LE_LINK) - INIT_DELAYED_WORK(&conn->security_timer, security_timeout); - else - INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); + INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); INIT_WORK(&conn->disconn_work, disconn_work); -- cgit v1.2.3 From a74a8c846fb699f3277c0c21278bd4c414074b4a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 22 Jul 2014 14:50:47 +0200 Subject: mac80211: don't duplicate station QoS capability data We currently track the QoS capability twice: for all peer stations in the WLAN_STA_WME flag, and for any clients associated to an AP interface separately for drivers in the sta->sta.wme field. Remove the WLAN_STA_WME flag and track the capability only in the driver-visible field, getting rid of the limitation that the field is only valid in AP mode. Reviewed-by: Arik Nemtsov Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- net/mac80211/cfg.c | 13 +++---------- net/mac80211/debugfs_sta.c | 3 ++- net/mac80211/ibss.c | 2 +- net/mac80211/mesh_plink.c | 4 +--- net/mac80211/mlme.c | 3 +-- net/mac80211/sta_info.c | 4 ++-- net/mac80211/sta_info.h | 2 -- net/mac80211/tdls.c | 3 +-- net/mac80211/tx.c | 6 +++--- net/mac80211/wme.c | 4 ++-- 11 files changed, 17 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index dae2e24616e1..1cd84444665c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1405,7 +1405,7 @@ struct ieee80211_sta_rates { * @supp_rates: Bitmap of supported rates (per band) * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities - * @wme: indicates whether the STA supports WME. Only valid during AP-mode. + * @wme: indicates whether the STA supports QoS/WME. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void *), size is determined in hw information. * @uapsd_queues: bitmap of queues configured for uapsd. Only valid diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 927b4ea0128b..4d8989b87960 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1011,15 +1011,8 @@ static int sta_apply_parameters(struct ieee80211_local *local, clear_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE); } - if (mask & BIT(NL80211_STA_FLAG_WME)) { - if (set & BIT(NL80211_STA_FLAG_WME)) { - set_sta_flag(sta, WLAN_STA_WME); - sta->sta.wme = true; - } else { - clear_sta_flag(sta, WLAN_STA_WME); - sta->sta.wme = false; - } - } + if (mask & BIT(NL80211_STA_FLAG_WME)) + sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME); if (mask & BIT(NL80211_STA_FLAG_MFP)) { if (set & BIT(NL80211_STA_FLAG_MFP)) @@ -3352,7 +3345,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, band = chanctx_conf->def.chan->band; sta = sta_info_get_bss(sdata, peer); if (sta) { - qos = test_sta_flag(sta, WLAN_STA_WME); + qos = sta->sta.wme; } else { rcu_read_unlock(); return -ENOLINK; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 3db96648b45a..4a20fb8f1e23 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -77,7 +77,8 @@ static ssize_t sta_flags_read(struct file *file, char __user *userbuf, TEST(AUTH), TEST(ASSOC), TEST(PS_STA), TEST(PS_DRIVER), TEST(AUTHORIZED), TEST(SHORT_PREAMBLE), - TEST(WME), TEST(WDS), TEST(CLEAR_PS_FILT), + sta->sta.wme ? "WME\n" : "", + TEST(WDS), TEST(CLEAR_PS_FILT), TEST(MFP), TEST(BLOCK_BA), TEST(PSPOLL), TEST(UAPSD), TEST(SP), TEST(TDLS_PEER), TEST(TDLS_PEER_AUTH), TEST(4ADDR_EVENT), diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 9713dc54ea4b..5f9654d31a8d 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1038,7 +1038,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, } if (sta && elems->wmm_info) - set_sta_flag(sta, WLAN_STA_WME); + sta->sta.wme = true; if (sta && elems->ht_operation && elems->ht_cap_elem && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT && diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 63b874101b27..8d67c1ebfbda 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -431,14 +431,12 @@ __mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr) return NULL; sta->plink_state = NL80211_PLINK_LISTEN; + sta->sta.wme = true; sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); - set_sta_flag(sta, WLAN_STA_WME); - sta->sta.wme = true; - return sta; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 31a8afaf7332..0e8d59a0e17e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2677,8 +2677,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) set_sta_flag(sta, WLAN_STA_MFP); - if (elems.wmm_param) - set_sta_flag(sta, WLAN_STA_WME); + sta->sta.wme = elems.wmm_param; err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index c6ee2139fbc5..e1f957d5935e 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1179,7 +1179,7 @@ static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; - bool qos = test_sta_flag(sta, WLAN_STA_WME); + bool qos = sta->sta.wme; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; @@ -1834,7 +1834,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE); - if (test_sta_flag(sta, WLAN_STA_WME)) + if (sta->sta.wme) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME); if (test_sta_flag(sta, WLAN_STA_MFP)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index d411bcc8ef08..89c40d5c0633 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -31,7 +31,6 @@ * when virtual port control is not in use. * @WLAN_STA_SHORT_PREAMBLE: Station is capable of receiving short-preamble * frames. - * @WLAN_STA_WME: Station is a QoS-STA. * @WLAN_STA_WDS: Station is one of our WDS peers. * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the * IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next @@ -69,7 +68,6 @@ enum ieee80211_sta_info_flags { WLAN_STA_PS_STA, WLAN_STA_AUTHORIZED, WLAN_STA_SHORT_PREAMBLE, - WLAN_STA_WME, WLAN_STA_WDS, WLAN_STA_CLEAR_PS_FILT, WLAN_STA_MFP, diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 1b21050be174..f2cb3b6c1871 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -316,8 +316,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, } /* add the QoS param IE if both the peer and we support it */ - if (local->hw.queues >= IEEE80211_NUM_ACS && - test_sta_flag(sta, WLAN_STA_WME)) + if (local->hw.queues >= IEEE80211_NUM_ACS && sta->sta.wme) ieee80211_tdls_add_wmm_param_ie(sdata, skb); /* add any custom IEs that go before HT operation */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 464106c023d8..2051cc60f8ce 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1844,7 +1844,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 30; authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); - wme_sta = test_sta_flag(sta, WLAN_STA_WME); + wme_sta = sta->sta.wme; } ap_sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); @@ -1957,7 +1957,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, if (sta) { authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); - wme_sta = test_sta_flag(sta, WLAN_STA_WME); + wme_sta = sta->sta.wme; tdls_peer = test_sta_flag(sta, WLAN_STA_TDLS_PEER); tdls_auth = test_sta_flag(sta, @@ -2035,7 +2035,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, sta = sta_info_get(sdata, hdr.addr1); if (sta) { authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); - wme_sta = test_sta_flag(sta, WLAN_STA_WME); + wme_sta = sta->sta.wme; } } diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index d51422c778de..6459946f0b74 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -118,7 +118,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_AP_VLAN: sta = rcu_dereference(sdata->u.vlan.sta); if (sta) { - qos = test_sta_flag(sta, WLAN_STA_WME); + qos = sta->sta.wme; break; } case NL80211_IFTYPE_AP: @@ -145,7 +145,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, if (!sta && ra && !is_multicast_ether_addr(ra)) { sta = sta_info_get(sdata, ra); if (sta) - qos = test_sta_flag(sta, WLAN_STA_WME); + qos = sta->sta.wme; } rcu_read_unlock(); -- cgit v1.2.3 From e91ded8db57472c20b59b2242b100764cc152a10 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 4 Aug 2014 04:50:41 -0400 Subject: uapi: netfilter_arp: use __u8 instead of u_int8_t Similarly, the u_int8_t type is non-standard and not defined. Change it to use __u8 like the rest of the netfilter headers. Signed-off-by: Mike Frysinger Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_arp/arpt_mangle.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter_arp/arpt_mangle.h b/include/uapi/linux/netfilter_arp/arpt_mangle.h index 250f502902bb..8c2b16a1f5a0 100644 --- a/include/uapi/linux/netfilter_arp/arpt_mangle.h +++ b/include/uapi/linux/netfilter_arp/arpt_mangle.h @@ -13,7 +13,7 @@ struct arpt_mangle union { struct in_addr tgt_ip; } u_t; - u_int8_t flags; + __u8 flags; int target; }; -- cgit v1.2.3 From d4261e5650004d6d51137553ea5433d5828562dc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 19 Aug 2014 16:02:12 +0200 Subject: bonding: create netlink event when bonding option is changed Userspace needs to be notified if one changes some option. Signed-off-by: Jiri Pirko Acked-by: Veaceslav Falico Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- drivers/net/bonding/bond_options.c | 2 ++ include/linux/netdevice.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index dc73463c2c23..d8dc17faa6b4 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -625,6 +625,8 @@ int __bond_opt_set(struct bonding *bond, out: if (ret) bond_opt_error_interpret(bond, opt, ret, val); + else + call_netdevice_notifiers(NETDEV_CHANGEINFODATA, bond->dev); return ret; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 38377392d082..7e2b0b8b5cd7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1982,6 +1982,7 @@ struct pcpu_sw_netstats { #define NETDEV_CHANGEUPPER 0x0015 #define NETDEV_RESEND_IGMP 0x0016 #define NETDEV_PRECHANGEMTU 0x0017 /* notify before mtu change happened */ +#define NETDEV_CHANGEINFODATA 0x0018 int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); -- cgit v1.2.3 From d2de875c6d4cbec8a99c880160181a3ed5b9992e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Aug 2014 18:32:09 -0700 Subject: net: use ktime_get_ns() and ktime_get_real_ns() helpers ktime_get_ns() replaces ktime_to_ns(ktime_get()) ktime_get_real_ns() replaces ktime_to_ns(ktime_get_real()) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/codel.h | 2 +- include/net/pkt_sched.h | 2 +- net/core/secure_seq.c | 6 +++--- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 2 +- net/netfilter/nf_conntrack_standalone.c | 2 +- net/sched/act_police.c | 4 ++-- net/sched/sch_fq.c | 4 ++-- net/sched/sch_htb.c | 6 +++--- net/sched/sch_tbf.c | 6 +++--- 10 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/codel.h b/include/net/codel.h index fe0eab32ce76..aeee28081245 100644 --- a/include/net/codel.h +++ b/include/net/codel.h @@ -66,7 +66,7 @@ typedef s32 codel_tdiff_t; static inline codel_time_t codel_get_time(void) { - u64 ns = ktime_to_ns(ktime_get()); + u64 ns = ktime_get_ns(); return ns >> CODEL_SHIFT; } diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index ec030cd76616..8bbe626e9ece 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -50,7 +50,7 @@ typedef long psched_tdiff_t; static inline psched_time_t psched_get_time(void) { - return PSCHED_NS2TICKS(ktime_to_ns(ktime_get())); + return PSCHED_NS2TICKS(ktime_get_ns()); } static inline psched_tdiff_t diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index ba71212f0251..51dd3193a33e 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -35,7 +35,7 @@ static u32 seq_scale(u32 seq) * overlaps less than one time per MSL (2 minutes). * Choosing a clock of 64 ns period is OK. (period of 274 s) */ - return seq + (ktime_to_ns(ktime_get_real()) >> 6); + return seq + (ktime_get_real_ns() >> 6); } #endif @@ -135,7 +135,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, md5_transform(hash, net_secret); seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); + seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; @@ -163,7 +163,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, md5_transform(hash, secret); seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); + seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index de88c4ab5146..0b634e7f1652 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -358,7 +358,7 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) tstamp = nf_conn_tstamp_find(ct); if (tstamp && tstamp->stop == 0) - tstamp->stop = ktime_to_ns(ktime_get_real()); + tstamp->stop = ktime_get_real_ns(); if (nf_ct_is_dying(ct)) goto delete; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 355a5c4ef763..1bd9ed9e62f6 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1737,7 +1737,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, } tstamp = nf_conn_tstamp_find(ct); if (tstamp) - tstamp->start = ktime_to_ns(ktime_get_real()); + tstamp->start = ktime_get_real_ns(); err = nf_conntrack_hash_check_insert(ct); if (err < 0) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index f641751dba9d..cf65a1e040dd 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -101,7 +101,7 @@ static void *ct_seq_start(struct seq_file *seq, loff_t *pos) { struct ct_iter_state *st = seq->private; - st->time_now = ktime_to_ns(ktime_get_real()); + st->time_now = ktime_get_real_ns(); rcu_read_lock(); return ct_get_idx(seq, *pos); } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0566e4606a4a..f32bcb094915 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -231,7 +231,7 @@ override: if (ret != ACT_P_CREATED) return ret; - police->tcfp_t_c = ktime_to_ns(ktime_get()); + police->tcfp_t_c = ktime_get_ns(); police->tcf_index = parm->index ? parm->index : tcf_hash_new_index(hinfo); h = tcf_hash(police->tcf_index, POL_TAB_MASK); @@ -279,7 +279,7 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, return police->tcfp_result; } - now = ktime_to_ns(ktime_get()); + now = ktime_get_ns(); toks = min_t(s64, now - police->tcfp_t_c, police->tcfp_burst); if (police->peak_present) { diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index ba32c2b005d0..e12f997e1b4c 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -416,7 +416,7 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) static struct sk_buff *fq_dequeue(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_to_ns(ktime_get()); + u64 now = ktime_get_ns(); struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; @@ -787,7 +787,7 @@ nla_put_failure: static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_to_ns(ktime_get()); + u64 now = ktime_get_ns(); struct tc_fq_qd_stats st = { .gc_flows = q->stat_gc_flows, .highprio_packets = q->stat_internal_packets, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 9f949abcacef..aea942ce6008 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -895,7 +895,7 @@ ok: if (!sch->q.qlen) goto fin; - q->now = ktime_to_ns(ktime_get()); + q->now = ktime_get_ns(); start_at = jiffies; next_event = q->now + 5LLU * NSEC_PER_SEC; @@ -1225,7 +1225,7 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, parent->un.leaf.q = new_q ? new_q : &noop_qdisc; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; - parent->t_c = ktime_to_ns(ktime_get()); + parent->t_c = ktime_get_ns(); parent->cmode = HTB_CAN_SEND; } @@ -1455,7 +1455,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->tokens = PSCHED_TICKS2NS(hopt->buffer); cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer); cl->mbuffer = 60ULL * NSEC_PER_SEC; /* 1min */ - cl->t_c = ktime_to_ns(ktime_get()); + cl->t_c = ktime_get_ns(); cl->cmode = HTB_CAN_SEND; /* attach to the hash list and parent's family */ diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 18ff63433709..0c39b754083b 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -239,7 +239,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) s64 ptoks = 0; unsigned int len = qdisc_pkt_len(skb); - now = ktime_to_ns(ktime_get()); + now = ktime_get_ns(); toks = min_t(s64, now - q->t_c, q->buffer); if (tbf_peak_present(q)) { @@ -292,7 +292,7 @@ static void tbf_reset(struct Qdisc *sch) qdisc_reset(q->qdisc); sch->q.qlen = 0; - q->t_c = ktime_to_ns(ktime_get()); + q->t_c = ktime_get_ns(); q->tokens = q->buffer; q->ptokens = q->mtu; qdisc_watchdog_cancel(&q->watchdog); @@ -431,7 +431,7 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - q->t_c = ktime_to_ns(ktime_get()); + q->t_c = ktime_get_ns(); qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; -- cgit v1.2.3 From 989e04c5bc3ff77d65e1f0d87bf7904dfa30d41c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 22 Aug 2014 14:15:22 -0700 Subject: tcp: improve undo on timeout Upon timeout, undo (via both timestamps/Eifel and DSACKs) was disabled if any retransmits were still in flight. The concern was perhaps that spurious retransmission sent in a previous recovery episode may trigger DSACKs to falsely undo the current recovery. However, this inadvertently misses undo opportunities (using either TCP timestamps or DSACKs) when timeout occurs during a loss episode, i.e. recurring timeouts or timeout during fast recovery. In these cases some retransmissions will be in flight but we should allow undo. Furthermore, we should only reset undo_marker and undo_retrans upon timeout if we are starting a new recovery episode. Finally, when we do reset our undo state, we now do so in a manner similar to tcp_enter_recovery(), so that we require a DSACK for each of the outstsanding retransmissions. This will achieve the original goal by requiring that we receive the same number of DSACKs as retransmissions. This patch increases the undo events by 50% on Google servers. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- net/ipv4/tcp_input.c | 26 +++++++++++--------------- 2 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fa5258f322e7..e567f0dbf282 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -276,7 +276,7 @@ struct tcp_sock { u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ - u32 undo_marker; /* tracking retrans started here. */ + u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ u32 total_retrans; /* Total retransmits for entire connection */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a906e0200ff2..aba4926ca095 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1888,21 +1888,21 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->sacked_out = 0; } -static void tcp_clear_retrans_partial(struct tcp_sock *tp) +void tcp_clear_retrans(struct tcp_sock *tp) { tp->retrans_out = 0; tp->lost_out = 0; - tp->undo_marker = 0; tp->undo_retrans = -1; + tp->fackets_out = 0; + tp->sacked_out = 0; } -void tcp_clear_retrans(struct tcp_sock *tp) +static inline void tcp_init_undo(struct tcp_sock *tp) { - tcp_clear_retrans_partial(tp); - - tp->fackets_out = 0; - tp->sacked_out = 0; + tp->undo_marker = tp->snd_una; + /* Retransmission still in flight may cause DSACKs later. */ + tp->undo_retrans = tp->retrans_out ? : -1; } /* Enter Loss state. If we detect SACK reneging, forget all SACK information @@ -1925,18 +1925,18 @@ void tcp_enter_loss(struct sock *sk) tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); + tcp_init_undo(tp); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tcp_clear_retrans_partial(tp); + tp->retrans_out = 0; + tp->lost_out = 0; if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - tp->undo_marker = tp->snd_una; - skb = tcp_write_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { @@ -1950,9 +1950,6 @@ void tcp_enter_loss(struct sock *sk) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) - tp->undo_marker = 0; - TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; @@ -2671,8 +2668,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->prior_ssthresh = 0; - tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out ? : -1; + tcp_init_undo(tp); if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) -- cgit v1.2.3 From 3af20efc0f83cdc65ce56ec108c0e81f602364df Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 22 Aug 2014 18:55:39 -0700 Subject: net: phy: broadcom: extract all registers to brcmphy.h Commit 439d39a9ac8fbbba9c04581361188f33f21ced50 ("net: phy: broadcom: extract register definitions") added a bunch of registers to brcmphy.h but left some to broadcom.c, move all of them to the header file since the BCM54xx and BCM7xxx PHY drivers do share all of these registers. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 104 --------------------------------------------- include/linux/brcmphy.h | 103 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 104 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 34088d60da74..6001d76d8676 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -25,110 +25,6 @@ #define BRCM_PHY_REV(phydev) \ ((phydev)->drv->phy_id & ~((phydev)->drv->phy_id_mask)) -/* - * Broadcom LED source encodings. These are used in BCM5461, BCM5481, - * BCM5482, and possibly some others. - */ -#define BCM_LED_SRC_LINKSPD1 0x0 -#define BCM_LED_SRC_LINKSPD2 0x1 -#define BCM_LED_SRC_XMITLED 0x2 -#define BCM_LED_SRC_ACTIVITYLED 0x3 -#define BCM_LED_SRC_FDXLED 0x4 -#define BCM_LED_SRC_SLAVE 0x5 -#define BCM_LED_SRC_INTR 0x6 -#define BCM_LED_SRC_QUALITY 0x7 -#define BCM_LED_SRC_RCVLED 0x8 -#define BCM_LED_SRC_MULTICOLOR1 0xa -#define BCM_LED_SRC_OPENSHORT 0xb -#define BCM_LED_SRC_OFF 0xe /* Tied high */ -#define BCM_LED_SRC_ON 0xf /* Tied low */ - - -/* - * BCM5482: Shadow registers - * Shadow values go into bits [14:10] of register 0x1c to select a shadow - * register to access. - */ -/* 00101: Spare Control Register 3 */ -#define BCM54XX_SHD_SCR3 0x05 -#define BCM54XX_SHD_SCR3_DEF_CLK125 0x0001 -#define BCM54XX_SHD_SCR3_DLLAPD_DIS 0x0002 -#define BCM54XX_SHD_SCR3_TRDDAPD 0x0004 - -/* 01010: Auto Power-Down */ -#define BCM54XX_SHD_APD 0x0a -#define BCM54XX_SHD_APD_EN 0x0020 - -#define BCM5482_SHD_LEDS1 0x0d /* 01101: LED Selector 1 */ - /* LED3 / ~LINKSPD[2] selector */ -#define BCM5482_SHD_LEDS1_LED3(src) ((src & 0xf) << 4) - /* LED1 / ~LINKSPD[1] selector */ -#define BCM5482_SHD_LEDS1_LED1(src) ((src & 0xf) << 0) -#define BCM54XX_SHD_RGMII_MODE 0x0b /* 01011: RGMII Mode Selector */ -#define BCM5482_SHD_SSD 0x14 /* 10100: Secondary SerDes control */ -#define BCM5482_SHD_SSD_LEDM 0x0008 /* SSD LED Mode enable */ -#define BCM5482_SHD_SSD_EN 0x0001 /* SSD enable */ -#define BCM5482_SHD_MODE 0x1f /* 11111: Mode Control Register */ -#define BCM5482_SHD_MODE_1000BX 0x0001 /* Enable 1000BASE-X registers */ - - -/* - * EXPANSION SHADOW ACCESS REGISTERS. (PHY REG 0x15, 0x16, and 0x17) - */ -#define MII_BCM54XX_EXP_AADJ1CH0 0x001f -#define MII_BCM54XX_EXP_AADJ1CH0_SWP_ABCD_OEN 0x0200 -#define MII_BCM54XX_EXP_AADJ1CH0_SWSEL_THPF 0x0100 -#define MII_BCM54XX_EXP_AADJ1CH3 0x601f -#define MII_BCM54XX_EXP_AADJ1CH3_ADCCKADJ 0x0002 -#define MII_BCM54XX_EXP_EXP08 0x0F08 -#define MII_BCM54XX_EXP_EXP08_RJCT_2MHZ 0x0001 -#define MII_BCM54XX_EXP_EXP08_EARLY_DAC_WAKE 0x0200 -#define MII_BCM54XX_EXP_EXP75 0x0f75 -#define MII_BCM54XX_EXP_EXP75_VDACCTRL 0x003c -#define MII_BCM54XX_EXP_EXP75_CM_OSC 0x0001 -#define MII_BCM54XX_EXP_EXP96 0x0f96 -#define MII_BCM54XX_EXP_EXP96_MYST 0x0010 -#define MII_BCM54XX_EXP_EXP97 0x0f97 -#define MII_BCM54XX_EXP_EXP97_MYST 0x0c0c - -/* - * BCM5482: Secondary SerDes registers - */ -#define BCM5482_SSD_1000BX_CTL 0x00 /* 1000BASE-X Control */ -#define BCM5482_SSD_1000BX_CTL_PWRDOWN 0x0800 /* Power-down SSD */ -#define BCM5482_SSD_SGMII_SLAVE 0x15 /* SGMII Slave Register */ -#define BCM5482_SSD_SGMII_SLAVE_EN 0x0002 /* Slave mode enable */ -#define BCM5482_SSD_SGMII_SLAVE_AD 0x0001 /* Slave auto-detection */ - - -/*****************************************************************************/ -/* Fast Ethernet Transceiver definitions. */ -/*****************************************************************************/ - -#define MII_BRCM_FET_INTREG 0x1a /* Interrupt register */ -#define MII_BRCM_FET_IR_MASK 0x0100 /* Mask all interrupts */ -#define MII_BRCM_FET_IR_LINK_EN 0x0200 /* Link status change enable */ -#define MII_BRCM_FET_IR_SPEED_EN 0x0400 /* Link speed change enable */ -#define MII_BRCM_FET_IR_DUPLEX_EN 0x0800 /* Duplex mode change enable */ -#define MII_BRCM_FET_IR_ENABLE 0x4000 /* Interrupt enable */ - -#define MII_BRCM_FET_BRCMTEST 0x1f /* Brcm test register */ -#define MII_BRCM_FET_BT_SRE 0x0080 /* Shadow register enable */ - - -/*** Shadow register definitions ***/ - -#define MII_BRCM_FET_SHDW_MISCCTRL 0x10 /* Shadow misc ctrl */ -#define MII_BRCM_FET_SHDW_MC_FAME 0x4000 /* Force Auto MDIX enable */ - -#define MII_BRCM_FET_SHDW_AUXMODE4 0x1a /* Auxiliary mode 4 */ -#define MII_BRCM_FET_SHDW_AM4_LED_MASK 0x0003 -#define MII_BRCM_FET_SHDW_AM4_LED_MODE1 0x0001 - -#define MII_BRCM_FET_SHDW_AUXSTAT2 0x1b /* Auxiliary status 2 */ -#define MII_BRCM_FET_SHDW_AS2_APDE 0x0020 /* Auto power down enable */ - - MODULE_DESCRIPTION("Broadcom PHY driver"); MODULE_AUTHOR("Maciej W. Rozycki"); MODULE_LICENSE("GPL"); diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 61219b9b3445..be31bf9f60c2 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -92,4 +92,107 @@ #define MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL 0x0000 +/* + * Broadcom LED source encodings. These are used in BCM5461, BCM5481, + * BCM5482, and possibly some others. + */ +#define BCM_LED_SRC_LINKSPD1 0x0 +#define BCM_LED_SRC_LINKSPD2 0x1 +#define BCM_LED_SRC_XMITLED 0x2 +#define BCM_LED_SRC_ACTIVITYLED 0x3 +#define BCM_LED_SRC_FDXLED 0x4 +#define BCM_LED_SRC_SLAVE 0x5 +#define BCM_LED_SRC_INTR 0x6 +#define BCM_LED_SRC_QUALITY 0x7 +#define BCM_LED_SRC_RCVLED 0x8 +#define BCM_LED_SRC_MULTICOLOR1 0xa +#define BCM_LED_SRC_OPENSHORT 0xb +#define BCM_LED_SRC_OFF 0xe /* Tied high */ +#define BCM_LED_SRC_ON 0xf /* Tied low */ + + +/* + * BCM5482: Shadow registers + * Shadow values go into bits [14:10] of register 0x1c to select a shadow + * register to access. + */ +/* 00101: Spare Control Register 3 */ +#define BCM54XX_SHD_SCR3 0x05 +#define BCM54XX_SHD_SCR3_DEF_CLK125 0x0001 +#define BCM54XX_SHD_SCR3_DLLAPD_DIS 0x0002 +#define BCM54XX_SHD_SCR3_TRDDAPD 0x0004 + +/* 01010: Auto Power-Down */ +#define BCM54XX_SHD_APD 0x0a +#define BCM54XX_SHD_APD_EN 0x0020 + +#define BCM5482_SHD_LEDS1 0x0d /* 01101: LED Selector 1 */ + /* LED3 / ~LINKSPD[2] selector */ +#define BCM5482_SHD_LEDS1_LED3(src) ((src & 0xf) << 4) + /* LED1 / ~LINKSPD[1] selector */ +#define BCM5482_SHD_LEDS1_LED1(src) ((src & 0xf) << 0) +#define BCM54XX_SHD_RGMII_MODE 0x0b /* 01011: RGMII Mode Selector */ +#define BCM5482_SHD_SSD 0x14 /* 10100: Secondary SerDes control */ +#define BCM5482_SHD_SSD_LEDM 0x0008 /* SSD LED Mode enable */ +#define BCM5482_SHD_SSD_EN 0x0001 /* SSD enable */ +#define BCM5482_SHD_MODE 0x1f /* 11111: Mode Control Register */ +#define BCM5482_SHD_MODE_1000BX 0x0001 /* Enable 1000BASE-X registers */ + + +/* + * EXPANSION SHADOW ACCESS REGISTERS. (PHY REG 0x15, 0x16, and 0x17) + */ +#define MII_BCM54XX_EXP_AADJ1CH0 0x001f +#define MII_BCM54XX_EXP_AADJ1CH0_SWP_ABCD_OEN 0x0200 +#define MII_BCM54XX_EXP_AADJ1CH0_SWSEL_THPF 0x0100 +#define MII_BCM54XX_EXP_AADJ1CH3 0x601f +#define MII_BCM54XX_EXP_AADJ1CH3_ADCCKADJ 0x0002 +#define MII_BCM54XX_EXP_EXP08 0x0F08 +#define MII_BCM54XX_EXP_EXP08_RJCT_2MHZ 0x0001 +#define MII_BCM54XX_EXP_EXP08_EARLY_DAC_WAKE 0x0200 +#define MII_BCM54XX_EXP_EXP75 0x0f75 +#define MII_BCM54XX_EXP_EXP75_VDACCTRL 0x003c +#define MII_BCM54XX_EXP_EXP75_CM_OSC 0x0001 +#define MII_BCM54XX_EXP_EXP96 0x0f96 +#define MII_BCM54XX_EXP_EXP96_MYST 0x0010 +#define MII_BCM54XX_EXP_EXP97 0x0f97 +#define MII_BCM54XX_EXP_EXP97_MYST 0x0c0c + +/* + * BCM5482: Secondary SerDes registers + */ +#define BCM5482_SSD_1000BX_CTL 0x00 /* 1000BASE-X Control */ +#define BCM5482_SSD_1000BX_CTL_PWRDOWN 0x0800 /* Power-down SSD */ +#define BCM5482_SSD_SGMII_SLAVE 0x15 /* SGMII Slave Register */ +#define BCM5482_SSD_SGMII_SLAVE_EN 0x0002 /* Slave mode enable */ +#define BCM5482_SSD_SGMII_SLAVE_AD 0x0001 /* Slave auto-detection */ + + +/*****************************************************************************/ +/* Fast Ethernet Transceiver definitions. */ +/*****************************************************************************/ + +#define MII_BRCM_FET_INTREG 0x1a /* Interrupt register */ +#define MII_BRCM_FET_IR_MASK 0x0100 /* Mask all interrupts */ +#define MII_BRCM_FET_IR_LINK_EN 0x0200 /* Link status change enable */ +#define MII_BRCM_FET_IR_SPEED_EN 0x0400 /* Link speed change enable */ +#define MII_BRCM_FET_IR_DUPLEX_EN 0x0800 /* Duplex mode change enable */ +#define MII_BRCM_FET_IR_ENABLE 0x4000 /* Interrupt enable */ + +#define MII_BRCM_FET_BRCMTEST 0x1f /* Brcm test register */ +#define MII_BRCM_FET_BT_SRE 0x0080 /* Shadow register enable */ + + +/*** Shadow register definitions ***/ + +#define MII_BRCM_FET_SHDW_MISCCTRL 0x10 /* Shadow misc ctrl */ +#define MII_BRCM_FET_SHDW_MC_FAME 0x4000 /* Force Auto MDIX enable */ + +#define MII_BRCM_FET_SHDW_AUXMODE4 0x1a /* Auxiliary mode 4 */ +#define MII_BRCM_FET_SHDW_AM4_LED_MASK 0x0003 +#define MII_BRCM_FET_SHDW_AM4_LED_MODE1 0x0001 + +#define MII_BRCM_FET_SHDW_AUXSTAT2 0x1b /* Auxiliary status 2 */ +#define MII_BRCM_FET_SHDW_AS2_APDE 0x0020 /* Auto power down enable */ + #endif /* _LINUX_BRCMPHY_H */ -- cgit v1.2.3 From 705314797b8b997554b7e9d0ea7b65a497356e53 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 22 Aug 2014 18:55:40 -0700 Subject: net: phy: broadcom: move shadow 0x1C register accessors to brcmphy.h The shadow register 0x1C is used both by the BCM54xxx PHYs and the BCM7xxx internal PHYs, move the accessors to a common location so both drivers can use them. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 18 ------------------ include/linux/brcmphy.h | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 6001d76d8676..854f2c9a7b2b 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -29,24 +29,6 @@ MODULE_DESCRIPTION("Broadcom PHY driver"); MODULE_AUTHOR("Maciej W. Rozycki"); MODULE_LICENSE("GPL"); -/* - * Indirect register access functions for the 1000BASE-T/100BASE-TX/10BASE-T - * 0x1c shadow registers. - */ -static int bcm54xx_shadow_read(struct phy_device *phydev, u16 shadow) -{ - phy_write(phydev, MII_BCM54XX_SHD, MII_BCM54XX_SHD_VAL(shadow)); - return MII_BCM54XX_SHD_DATA(phy_read(phydev, MII_BCM54XX_SHD)); -} - -static int bcm54xx_shadow_write(struct phy_device *phydev, u16 shadow, u16 val) -{ - return phy_write(phydev, MII_BCM54XX_SHD, - MII_BCM54XX_SHD_WRITE | - MII_BCM54XX_SHD_VAL(shadow) | - MII_BCM54XX_SHD_DATA(val)); -} - /* Indirect register access functions for the Expansion Registers */ static int bcm54xx_exp_read(struct phy_device *phydev, u16 regnum) { diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index be31bf9f60c2..722cf26567fa 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -195,4 +195,24 @@ #define MII_BRCM_FET_SHDW_AUXSTAT2 0x1b /* Auxiliary status 2 */ #define MII_BRCM_FET_SHDW_AS2_APDE 0x0020 /* Auto power down enable */ +/* + * Indirect register access functions for the 1000BASE-T/100BASE-TX/10BASE-T + * 0x1c shadow registers. + */ +static inline int bcm54xx_shadow_read(struct phy_device *phydev, u16 shadow) +{ + phy_write(phydev, MII_BCM54XX_SHD, MII_BCM54XX_SHD_VAL(shadow)); + return MII_BCM54XX_SHD_DATA(phy_read(phydev, MII_BCM54XX_SHD)); +} + +static inline int bcm54xx_shadow_write(struct phy_device *phydev, u16 shadow, + u16 val) +{ + return phy_write(phydev, MII_BCM54XX_SHD, + MII_BCM54XX_SHD_WRITE | + MII_BCM54XX_SHD_VAL(shadow) | + MII_BCM54XX_SHD_DATA(val)); +} + + #endif /* _LINUX_BRCMPHY_H */ -- cgit v1.2.3 From 66ce7fb9807b036058aa380bfd2b3851ae25ce39 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 22 Aug 2014 18:55:43 -0700 Subject: net: phy: export phy_{read,write}_mmd_indirect Some PHY drivers might need to access Clause 45 registers in Clause 22 compatibility mode to e.g: properly advertise EEE support when disabled by default. Export these two helper functions: phy_read_mmd_indirect() and phy_write_mmd_indirect() for drivers to use them. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 6 ++++-- include/linux/phy.h | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index c94e2a27446a..e7a5893f32ff 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -955,7 +955,7 @@ static inline void mmd_phy_indirect(struct mii_bus *bus, int prtad, int devad, * 3) Write reg 13 // MMD Data Command for MMD DEVAD * 3) Read reg 14 // Read MMD data */ -static int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, +int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, int devad, int addr) { struct phy_driver *phydrv = phydev->drv; @@ -971,6 +971,7 @@ static int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, } return value; } +EXPORT_SYMBOL(phy_read_mmd_indirect); /** * phy_write_mmd_indirect - writes data to the MMD registers @@ -988,7 +989,7 @@ static int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, * 3) Write reg 13 // MMD Data Command for MMD DEVAD * 3) Write reg 14 // Write MMD data */ -static void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, +void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, int devad, int addr, u32 data) { struct phy_driver *phydrv = phydev->drv; @@ -1002,6 +1003,7 @@ static void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, phydrv->write_mmd_indirect(phydev, prtad, devad, addr, data); } } +EXPORT_SYMBOL(phy_write_mmd_indirect); /** * phy_init_eee - init and check the EEE feature diff --git a/include/linux/phy.h b/include/linux/phy.h index ed39956b5613..d090cfcaa167 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -597,6 +597,19 @@ static inline int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff)); } +/** + * phy_read_mmd_indirect - reads data from the MMD registers + * @phydev: The PHY device bus + * @prtad: MMD Address + * @devad: MMD DEVAD + * @addr: PHY address on the MII bus + * + * Description: it reads data from the MMD registers (clause 22 to access to + * clause 45) of the specified phy address. + */ +int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, + int devad, int addr); + /** * phy_read - Convenience function for reading a given PHY register * @phydev: the phy_device struct @@ -668,6 +681,20 @@ static inline int phy_write_mmd(struct phy_device *phydev, int devad, return mdiobus_write(phydev->bus, phydev->addr, regnum, val); } +/** + * phy_write_mmd_indirect - writes data to the MMD registers + * @phydev: The PHY device + * @prtad: MMD Address + * @devad: MMD DEVAD + * @addr: PHY address on the MII bus + * @data: data to write in the MMD register + * + * Description: Write data from the MMD registers of the specified + * phy address. + */ +void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, + int devad, int addr, u32 data); + struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); -- cgit v1.2.3 From b8f9a02924bbeb0c46ca4c19561cbe765b80e264 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 22 Aug 2014 18:55:45 -0700 Subject: net: phy: bcm7xxx: enable EEE at the PHY level The 28nm Gigabit PHY on BCM7xxx chips comes out of reset with absolutely no EEE capabilities, such that we would actually return that we do not support EEE when accessing 3.20 (MDIO_PCS_EEE_ABLE) registers. Poke through the vendor-specific C45 register to enable EEE globally at the PHY level, and advertise supported EEE modes. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm7xxx.c | 31 +++++++++++++++++++++++++++++++ include/linux/brcmphy.h | 3 +++ 2 files changed, 34 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index 29e256a4ed57..e98c510b75c5 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -14,6 +14,7 @@ #include #include #include +#include /* Broadcom BCM7xxx internal PHY registers */ #define MII_BCM7XXX_CHANNEL_WIDTH 0x2000 @@ -167,6 +168,32 @@ static int bcm7xxx_apd_enable(struct phy_device *phydev) return bcm54xx_shadow_write(phydev, BCM54XX_SHD_APD, val); } +static int bcm7xxx_eee_enable(struct phy_device *phydev) +{ + int val; + + val = phy_read_mmd_indirect(phydev, BRCM_CL45VEN_EEE_CONTROL, + MDIO_MMD_AN, phydev->addr); + if (val < 0) + return val; + + /* Enable general EEE feature at the PHY level */ + val |= LPI_FEATURE_EN | LPI_FEATURE_EN_DIG1000X; + + phy_write_mmd_indirect(phydev, BRCM_CL45VEN_EEE_CONTROL, + MDIO_MMD_AN, phydev->addr, val); + + /* Advertise supported modes */ + val = phy_read_mmd_indirect(phydev, MDIO_AN_EEE_ADV, + MDIO_MMD_AN, phydev->addr); + + val |= (MDIO_AN_EEE_ADV_100TX | MDIO_AN_EEE_ADV_1000T); + phy_write_mmd_indirect(phydev, MDIO_AN_EEE_ADV, + MDIO_MMD_AN, phydev->addr, val); + + return 0; +} + static int bcm7xxx_28nm_config_init(struct phy_device *phydev) { int ret; @@ -179,6 +206,10 @@ static int bcm7xxx_28nm_config_init(struct phy_device *phydev) if (ret) return ret; + ret = bcm7xxx_eee_enable(phydev); + if (ret) + return ret; + return bcm7xxx_apd_enable(phydev); } diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 722cf26567fa..ee1431d976fa 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -214,5 +214,8 @@ static inline int bcm54xx_shadow_write(struct phy_device *phydev, u16 shadow, MII_BCM54XX_SHD_DATA(val)); } +#define BRCM_CL45VEN_EEE_CONTROL 0x803d +#define LPI_FEATURE_EN 0x8000 +#define LPI_FEATURE_EN_DIG1000X 0x4000 #endif /* _LINUX_BRCMPHY_H */ -- cgit v1.2.3 From 690e36e726d00d2528bc569809048adf61550d80 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 23 Aug 2014 12:13:41 -0700 Subject: net: Allow raw buffers to be passed into the flow dissector. Drivers, and perhaps other entities we have not yet considered, sometimes want to know how deep the protocol headers go before deciding how large of an SKB to allocate and how much of the packet to place into the linear SKB area. For example, consider a driver which has a device which DMAs into pools of pages and then tells the driver where the data went in the DMA descriptor(s). The driver can then build an SKB and reference most of the data via SKB fragments (which are page/offset/length triplets). However at least some of the front of the packet should be placed into the linear SKB area, which comes before the fragments, so that packet processing can get at the headers efficiently. The first thing each protocol layer is going to do is a "pskb_may_pull()" so we might as well aggregate as much of this as possible while we're building the SKB in the driver. Part of supporting this is that we don't have an SKB yet, so we want to be able to let the flow dissector operate on a raw buffer in order to compute the offset of the end of the headers. So now we have a __skb_flow_dissect() which takes an explicit data pointer and length. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++++++------ include/net/flow_keys.h | 14 ++++++++++++-- net/core/flow_dissector.c | 40 ++++++++++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index abde271c18ae..18ddf9684a27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2567,20 +2567,26 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); -static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, - int len, void *buffer) +static inline void *__skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *data, int hlen, void *buffer) { - int hlen = skb_headlen(skb); - if (hlen - offset >= len) - return skb->data + offset; + return data + offset; - if (skb_copy_bits(skb, offset, buffer, len) < 0) + if (!skb || + skb_copy_bits(skb, offset, buffer, len) < 0) return NULL; return buffer; } +static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *buffer) +{ + return __skb_header_pointer(skb, offset, len, skb->data, + skb_headlen(skb), buffer); +} + /** * skb_needs_linearize - check if we need to linearize a given skb * depending on the given device features. diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index 6667a054763a..4040f63932c5 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -27,7 +27,17 @@ struct flow_keys { u8 ip_proto; }; -bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow); -__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto); +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + void *data, int hlen); +static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +{ + return __skb_flow_dissect(skb, flow, NULL, 0); +} +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen_proto); +static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +{ + return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); +} u32 flow_hash_from_keys(struct flow_keys *keys); #endif diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5f362c1d0332..660c6492fb78 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -34,29 +34,40 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ -__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen) { int poff = proto_ports_offset(ip_proto); + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + if (poff >= 0) { __be32 *ports, _ports; - ports = skb_header_pointer(skb, thoff + poff, - sizeof(_ports), &_ports); + ports = __skb_header_pointer(skb, thoff + poff, + sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } -EXPORT_SYMBOL(skb_flow_get_ports); +EXPORT_SYMBOL(__skb_flow_get_ports); -bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, void *data, int hlen) { int nhoff = skb_network_offset(skb); u8 ip_proto; __be16 proto = skb->protocol; + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + memset(flow, 0, sizeof(*flow)); again: @@ -65,7 +76,7 @@ again: const struct iphdr *iph; struct iphdr _iph; ip: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) return false; nhoff += iph->ihl * 4; @@ -83,7 +94,7 @@ ip: __be32 flow_label; ipv6: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) return false; @@ -113,7 +124,7 @@ ipv6: const struct vlan_hdr *vlan; struct vlan_hdr _vlan; - vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) return false; @@ -126,7 +137,7 @@ ipv6: struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; proto = hdr->proto; @@ -151,7 +162,7 @@ ipv6: __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; /* @@ -171,8 +182,9 @@ ipv6: const struct ethhdr *eth; struct ethhdr _eth; - eth = skb_header_pointer(skb, nhoff, - sizeof(_eth), &_eth); + eth = __skb_header_pointer(skb, nhoff, + sizeof(_eth), + data, hlen, &_eth); if (!eth) return false; proto = eth->h_proto; @@ -194,12 +206,12 @@ ipv6: flow->n_proto = proto; flow->ip_proto = ip_proto; - flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); + flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); flow->thoff = (u16) nhoff; return true; } -EXPORT_SYMBOL(skb_flow_dissect); +EXPORT_SYMBOL(__skb_flow_dissect); static u32 hashrnd __read_mostly; static __always_inline void __flow_hash_secret_init(void) -- cgit v1.2.3 From e2a093ff0dbfa4c5d99f25241cf33325e9691d91 Mon Sep 17 00:00:00 2001 From: Ana Rey Date: Wed, 6 Aug 2014 13:52:49 +0200 Subject: netfilter: nft_meta: add pkttype support Add pkttype support for ip, ipv6 and inet families of tables. This allows you to fetch the meta packet type based on the link layer information. The loopback traffic is a special case, the packet type is guessed from the network layer header. No special handling for bridge and arp since we're not going to see such traffic in the loopback interface. Joint work with Alvaro Neira Ayuso Signed-off-by: Alvaro Neira Ayuso Signed-off-by: Ana Rey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_meta.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 801bdd1e56e3..98144cdd8986 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -571,6 +571,7 @@ enum nft_exthdr_attributes { * @NFT_META_L4PROTO: layer 4 protocol number * @NFT_META_BRI_IIFNAME: packet input bridge interface name * @NFT_META_BRI_OIFNAME: packet output bridge interface name + * @NFT_META_PKTTYPE: packet type (skb->pkt_type), special handling for loopback */ enum nft_meta_keys { NFT_META_LEN, @@ -592,6 +593,7 @@ enum nft_meta_keys { NFT_META_L4PROTO, NFT_META_BRI_IIFNAME, NFT_META_BRI_OIFNAME, + NFT_META_PKTTYPE, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 852b178c6ae7..4f2862fc12c2 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include #include /* for TCP_TIME_WAIT */ @@ -124,6 +127,30 @@ void nft_meta_get_eval(const struct nft_expr *expr, dest->data[0] = skb->secmark; break; #endif + case NFT_META_PKTTYPE: + if (skb->pkt_type != PACKET_LOOPBACK) { + dest->data[0] = skb->pkt_type; + break; + } + + switch (pkt->ops->pf) { + case NFPROTO_IPV4: + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) + dest->data[0] = PACKET_MULTICAST; + else + dest->data[0] = PACKET_BROADCAST; + break; + case NFPROTO_IPV6: + if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) + dest->data[0] = PACKET_MULTICAST; + else + dest->data[0] = PACKET_BROADCAST; + break; + default: + WARN_ON(1); + goto err; + } + break; default: WARN_ON(1); goto err; @@ -195,6 +222,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: #endif + case NFT_META_PKTTYPE: break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From afc5be3079796b024823bad42dc5ebf716453575 Mon Sep 17 00:00:00 2001 From: Ana Rey Date: Sun, 24 Aug 2014 14:08:36 +0200 Subject: netfilter: nft_meta: Add cpu attribute support Add cpu support to meta expresion. This allows you to match packets with cpu number. Signed-off-by: Ana Rey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_meta.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 98144cdd8986..c9b6f00a3fb7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -572,6 +572,7 @@ enum nft_exthdr_attributes { * @NFT_META_BRI_IIFNAME: packet input bridge interface name * @NFT_META_BRI_OIFNAME: packet output bridge interface name * @NFT_META_PKTTYPE: packet type (skb->pkt_type), special handling for loopback + * @NFT_META_CPU: cpu id through smp_processor_id() */ enum nft_meta_keys { NFT_META_LEN, @@ -594,6 +595,7 @@ enum nft_meta_keys { NFT_META_BRI_IIFNAME, NFT_META_BRI_OIFNAME, NFT_META_PKTTYPE, + NFT_META_CPU, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 4f2862fc12c2..843e099a962d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include /* for TCP_TIME_WAIT */ @@ -151,6 +152,9 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; } break; + case NFT_META_CPU: + dest->data[0] = smp_processor_id(); + break; default: WARN_ON(1); goto err; @@ -223,6 +227,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_SECMARK: #endif case NFT_META_PKTTYPE: + case NFT_META_CPU: break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From 1b05756c48ea07ced9604ef01d11194d936da163 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 5 Aug 2014 22:02:34 +0200 Subject: netfilter: ipset: Fix warn: integer overflows 'sizeof(*map) + size * set->dsize' Dan Carpenter reported that the static checker emits the warning net/netfilter/ipset/ip_set_list_set.c:600 init_list_set() warn: integer overflows 'sizeof(*map) + size * set->dsize' Limit the maximal number of elements in list type of sets. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set_list.h | 1 + net/netfilter/ipset/ip_set_list_set.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set_list.h b/include/linux/netfilter/ipset/ip_set_list.h index 68c2aea897f5..fe2622a00151 100644 --- a/include/linux/netfilter/ipset/ip_set_list.h +++ b/include/linux/netfilter/ipset/ip_set_list.h @@ -6,5 +6,6 @@ #define IP_SET_LIST_DEFAULT_SIZE 8 #define IP_SET_LIST_MIN_SIZE 4 +#define IP_SET_LIST_MAX_SIZE 65536 #endif /* __IP_SET_LIST_H */ diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 3e2317f3cf68..f87adbad6076 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -597,7 +597,9 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) struct set_elem *e; u32 i; - map = kzalloc(sizeof(*map) + size * set->dsize, GFP_KERNEL); + map = kzalloc(sizeof(*map) + + min_t(u32, size, IP_SET_LIST_MAX_SIZE) * set->dsize, + GFP_KERNEL); if (!map) return false; -- cgit v1.2.3 From 573e8fca255a27e3573b51f9b183d62641c47a3d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 22 Aug 2014 13:33:47 -0700 Subject: net: skb_gro_checksum_* functions Add skb_gro_checksum_validate, skb_gro_checksum_validate_zero_check, and skb_gro_checksum_simple_validate, and __skb_gro_checksum_complete. These are the cognates of the normal checksum functions but are used in the gro_receive path and operate on GRO related fields in sk_buffs. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 76 +++++++++++++++++++++++++++++++++++++++++++++-- net/core/dev.c | 34 ++++++++++++++++++++- 2 files changed, 107 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7e2b0b8b5cd7..eb73444e1bd0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1883,7 +1883,13 @@ struct napi_gro_cb { u16 proto; /* Used in udp_gro_receive */ - u16 udp_mark; + u8 udp_mark:1; + + /* GRO checksum is valid */ + u8 csum_valid:1; + + /* Number encapsulation layers crossed */ + u8 encapsulation; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2154,11 +2160,77 @@ static inline void *skb_gro_network_header(struct sk_buff *skb) static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { - if (skb->ip_summed == CHECKSUM_COMPLETE) + if (NAPI_GRO_CB(skb)->csum_valid) NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum, csum_partial(start, len, 0)); } +/* GRO checksum functions. These are logical equivalents of the normal + * checksum functions (in skbuff.h) except that they operate on the GRO + * offsets and fields in sk_buff. + */ + +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb); + +static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, + bool zero_okay, + __sum16 check) +{ + return (skb->ip_summed != CHECKSUM_PARTIAL && + (skb->ip_summed != CHECKSUM_UNNECESSARY || + (NAPI_GRO_CB(skb)->encapsulation > skb->encapsulation)) && + (!zero_okay || check)); +} + +static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, + __wsum psum) +{ + if (NAPI_GRO_CB(skb)->csum_valid && + !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) + return 0; + + NAPI_GRO_CB(skb)->csum = psum; + + return __skb_gro_checksum_complete(skb); +} + +/* Update skb for CHECKSUM_UNNECESSARY when we verified a top level + * checksum or an encapsulated one during GRO. This saves work + * if we fallback to normal path with the packet. + */ +static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (NAPI_GRO_CB(skb)->encapsulation) + skb->encapsulation = 1; + } else if (skb->ip_summed != CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->encapsulation = 0; + } +} + +#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ + compute_pseudo) \ +({ \ + __sum16 __ret = 0; \ + if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ + __ret = __skb_gro_checksum_validate_complete(skb, \ + compute_pseudo(skb, proto)); \ + if (!__ret) \ + skb_gro_incr_csum_unnecessary(skb); \ + __ret; \ +}) + +#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ + __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) + +#define skb_gro_checksum_validate_zero_check(skb, proto, check, \ + compute_pseudo) \ + __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) + +#define skb_gro_checksum_simple_validate(skb) \ + __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/net/core/dev.c b/net/core/dev.c index 1421dad4cb29..b6a718ec11c1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3962,7 +3962,13 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; gro_list_prepare(napi, skb); - NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + } else { + NAPI_GRO_CB(skb)->csum_valid = 0; + } rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -3975,6 +3981,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; + NAPI_GRO_CB(skb)->encapsulation = 0; pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; @@ -4205,6 +4212,31 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_frags); +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); + /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. -- cgit v1.2.3 From 1933a7852ce6a81349855431b25122d7666bbfca Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 22 Aug 2014 13:34:04 -0700 Subject: net: add gro_compute_pseudo functions Add inet_gro_compute_pseudo and ip6_gro_compute_pseudo. These are the logical equivalents of inet_compute_pseudo and ip6_compute_pseudo for GRO path. The IP header is taken from skb_gro_network_header. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ip.h | 8 ++++++++ include/net/ip6_checksum.h | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index db4a771b9ef3..c8fd6112bd0b 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -364,6 +364,14 @@ static inline void inet_set_txhash(struct sock *sk) sk->sk_txhash = flow_hash_from_keys(&keys); } +static inline __wsum inet_gro_compute_pseudo(struct sk_buff *skb, int proto) +{ + const struct iphdr *iph = skb_gro_network_header(skb); + + return csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb_gro_len(skb), proto, 0); +} + /* * Map a multicast IP onto multicast MAC for type ethernet. */ diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h index 55236cb71174..1a49b73f7f6e 100644 --- a/include/net/ip6_checksum.h +++ b/include/net/ip6_checksum.h @@ -48,6 +48,14 @@ static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto) skb->len, proto, 0)); } +static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto) +{ + const struct ipv6hdr *iph = skb_gro_network_header(skb); + + return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr, + skb_gro_len(skb), proto, 0)); +} + static __inline__ __sum16 tcp_v6_check(int len, const struct in6_addr *saddr, const struct in6_addr *daddr, -- cgit v1.2.3 From 57c67ff4bd92af634f7c91c40eb02a96dd785dda Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 22 Aug 2014 13:34:44 -0700 Subject: udp: additional GRO support Implement GRO for UDPv6. Add UDP checksum verification in gro_receive for both UDP4 and UDP6 calling skb_gro_checksum_validate_zero_check. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/udp.h | 18 +++++++++++++++ net/ipv4/udp.c | 1 + net/ipv4/udp_offload.c | 61 ++++++++++++++++++++++++++++++++++++-------------- net/ipv6/udp_offload.c | 33 +++++++++++++++++++++++++++ 4 files changed, 96 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 70f941368ace..16f4e80f0519 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -158,6 +158,24 @@ static inline __sum16 udp_v4_check(int len, __be32 saddr, void udp_set_csum(bool nocheck, struct sk_buff *skb, __be32 saddr, __be32 daddr, int len); +struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, + struct udphdr *uh); +int udp_gro_complete(struct sk_buff *skb, int nhoff); + +static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) +{ + struct udphdr *uh; + unsigned int hlen, off; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*uh); + uh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) + uh = skb_gro_header_slow(skb, hlen, off); + + return uh; +} + /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline void udp_lib_hash(struct sock *sk) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 32f9571e776b..3549c21fe5f7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 59035bc3008d..8ed460e3753c 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -228,29 +228,22 @@ unlock: } EXPORT_SYMBOL(udp_del_offload); -static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, + struct udphdr *uh) { struct udp_offload_priv *uo_priv; struct sk_buff *p, **pp = NULL; - struct udphdr *uh, *uh2; - unsigned int hlen, off; + struct udphdr *uh2; + unsigned int off = skb_gro_offset(skb); int flush = 1; if (NAPI_GRO_CB(skb)->udp_mark || - (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) + (!skb->encapsulation && !NAPI_GRO_CB(skb)->csum_valid)) goto out; /* mark that this skb passed once through the udp gro layer */ NAPI_GRO_CB(skb)->udp_mark = 1; - - off = skb_gro_offset(skb); - hlen = off + sizeof(*uh); - uh = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - uh = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!uh)) - goto out; - } + NAPI_GRO_CB(skb)->encapsulation++; rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); @@ -269,7 +262,12 @@ unflush: continue; uh2 = (struct udphdr *)(p->data + off); - if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + + /* Match ports and either checksums are either both zero + * or nonzero. + */ + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) || + (!uh->check ^ !uh2->check)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } @@ -286,7 +284,24 @@ out: return pp; } -static int udp_gro_complete(struct sk_buff *skb, int nhoff) +static struct sk_buff **udp4_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct udphdr *uh = udp_gro_udphdr(skb); + + /* Don't bother verifying checksum if we're going to flush anyway. */ + if (unlikely(!uh) || + (!NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + inet_gro_compute_pseudo))) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + return udp_gro_receive(head, skb, uh); +} + +int udp_gro_complete(struct sk_buff *skb, int nhoff) { struct udp_offload_priv *uo_priv; __be16 newlen = htons(skb->len - nhoff); @@ -311,12 +326,24 @@ static int udp_gro_complete(struct sk_buff *skb, int nhoff) return err; } +int udp4_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct iphdr *iph = ip_hdr(skb); + struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + + if (uh->check) + uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, + iph->daddr, 0); + + return udp_gro_complete(skb, nhoff); +} + static const struct net_offload udpv4_offload = { .callbacks = { .gso_send_check = udp4_ufo_send_check, .gso_segment = udp4_ufo_fragment, - .gro_receive = udp_gro_receive, - .gro_complete = udp_gro_complete, + .gro_receive = udp4_gro_receive, + .gro_complete = udp4_gro_complete, }, }; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 0ae3d98f83e0..b13e377e9c53 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -10,6 +10,7 @@ * UDPv6 GSO support */ #include +#include #include #include #include @@ -127,10 +128,42 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, out: return segs; } + +static struct sk_buff **udp6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct udphdr *uh = udp_gro_udphdr(skb); + + /* Don't bother verifying checksum if we're going to flush anyway. */ + if (unlikely(!uh) || + (!NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + ip6_gro_compute_pseudo))) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + return udp_gro_receive(head, skb, uh); +} + +int udp6_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + + if (uh->check) + uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr, + &ipv6h->daddr, 0); + + return udp_gro_complete(skb, nhoff); +} + static const struct net_offload udpv6_offload = { .callbacks = { .gso_send_check = udp6_ufo_send_check, .gso_segment = udp6_ufo_fragment, + .gro_receive = udp6_gro_receive, + .gro_complete = udp6_gro_complete, }, }; -- cgit v1.2.3 From a98406e22c12e514bac28fec0a49dc793edaf3a8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 23 Aug 2014 17:03:28 +0200 Subject: random32: improvements to prandom_bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch addresses a couple of minor items, mostly addesssing prandom_bytes(): 1) prandom_bytes{,_state}() should use size_t for length arguments, 2) We can use put_unaligned() when filling the array instead of open coding it [ perhaps some archs will further benefit from their own arch specific implementation when GCC cannot make up for it ], 3) Fix a typo, 4) Better use unsigned int as type for getting the arch seed, 5) Make use of prandom_u32_max() for timer slack. Regarding the change to put_unaligned(), callers of prandom_bytes() which internally invoke prandom_bytes_state(), don't bother as they expect the array to be filled randomly and don't have any control of the internal state what-so-ever (that's also why we have periodic reseeding there, etc), so they really don't care. Now for the direct callers of prandom_bytes_state(), which are solely located in test cases for MTD devices, that is, drivers/mtd/tests/{oobtest.c,pagetest.c,subpagetest.c}: These tests basically fill a test write-vector through prandom_bytes_state() with an a-priori defined seed each time and write that to a MTD device. Later on, they set up a read-vector and read back that blocks from the device. So in the verification phase, the write-vector is being re-setup [ so same seed and prandom_bytes_state() called ], and then memcmp()'ed against the read-vector to check if the data is the same. Akinobu, Lothar and I also tested this patch and it runs through the 3 relevant MTD test cases w/o any errors on the nandsim device (simulator for MTD devs) for x86_64, ppc64, ARM (i.MX28, i.MX53 and i.MX6): # modprobe nandsim first_id_byte=0x20 second_id_byte=0xac \ third_id_byte=0x00 fourth_id_byte=0x15 # modprobe mtd_oobtest dev=0 # modprobe mtd_pagetest dev=0 # modprobe mtd_subpagetest dev=0 We also don't have any users depending directly on a particular result of the PRNG (except the PRNG self-test itself), and that's just fine as it e.g. allowed us easily to do things like upgrading from taus88 to taus113. Signed-off-by: Daniel Borkmann Tested-by: Akinobu Mita Tested-by: Lothar Waßmann Cc: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/random.h | 4 ++-- lib/random32.c | 39 ++++++++++++++++++--------------------- 2 files changed, 20 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/random.h b/include/linux/random.h index 57fbbffd77a0..b05856e16b75 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -26,7 +26,7 @@ unsigned int get_random_int(void); unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len); u32 prandom_u32(void); -void prandom_bytes(void *buf, int nbytes); +void prandom_bytes(void *buf, size_t nbytes); void prandom_seed(u32 seed); void prandom_reseed_late(void); @@ -35,7 +35,7 @@ struct rnd_state { }; u32 prandom_u32_state(struct rnd_state *state); -void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes); +void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes); /** * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro) diff --git a/lib/random32.c b/lib/random32.c index c9b6bf3afe0c..0bee183fa18f 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -37,6 +37,7 @@ #include #include #include +#include #ifdef CONFIG_RANDOM32_SELFTEST static void __init prandom_state_selftest(void); @@ -96,27 +97,23 @@ EXPORT_SYMBOL(prandom_u32); * This is used for pseudo-randomness with no outside seeding. * For more random results, use prandom_bytes(). */ -void prandom_bytes_state(struct rnd_state *state, void *buf, int bytes) +void prandom_bytes_state(struct rnd_state *state, void *buf, size_t bytes) { - unsigned char *p = buf; - int i; - - for (i = 0; i < round_down(bytes, sizeof(u32)); i += sizeof(u32)) { - u32 random = prandom_u32_state(state); - int j; + u8 *ptr = buf; - for (j = 0; j < sizeof(u32); j++) { - p[i + j] = random; - random >>= BITS_PER_BYTE; - } + while (bytes >= sizeof(u32)) { + put_unaligned(prandom_u32_state(state), (u32 *) ptr); + ptr += sizeof(u32); + bytes -= sizeof(u32); } - if (i < bytes) { - u32 random = prandom_u32_state(state); - for (; i < bytes; i++) { - p[i] = random; - random >>= BITS_PER_BYTE; - } + if (bytes > 0) { + u32 rem = prandom_u32_state(state); + do { + *ptr++ = (u8) rem; + bytes--; + rem >>= BITS_PER_BYTE; + } while (bytes > 0); } } EXPORT_SYMBOL(prandom_bytes_state); @@ -126,7 +123,7 @@ EXPORT_SYMBOL(prandom_bytes_state); * @buf: where to copy the pseudo-random bytes to * @bytes: the requested number of bytes */ -void prandom_bytes(void *buf, int bytes) +void prandom_bytes(void *buf, size_t bytes) { struct rnd_state *state = &get_cpu_var(net_rand_state); @@ -137,7 +134,7 @@ EXPORT_SYMBOL(prandom_bytes); static void prandom_warmup(struct rnd_state *state) { - /* Calling RNG ten times to satify recurrence condition */ + /* Calling RNG ten times to satisfy recurrence condition */ prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); @@ -152,7 +149,7 @@ static void prandom_warmup(struct rnd_state *state) static u32 __extract_hwseed(void) { - u32 val = 0; + unsigned int val = 0; (void)(arch_get_random_seed_int(&val) || arch_get_random_int(&val)); @@ -228,7 +225,7 @@ static void __prandom_timer(unsigned long dontcare) prandom_seed(entropy); /* reseed every ~60 seconds, in [40 .. 80) interval with slack */ - expires = 40 + (prandom_u32() % 40); + expires = 40 + prandom_u32_max(40); seed_timer.expires = jiffies + msecs_to_jiffies(expires * MSEC_PER_SEC); add_timer(&seed_timer); -- cgit v1.2.3 From 4798248e4e023170e937a65a1d30fcc52496dd42 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 22 Aug 2014 16:21:53 -0700 Subject: net: Add ops->ndo_xmit_flush() Signed-off-by: David S. Miller --- drivers/net/wan/dlci.c | 2 +- drivers/usb/gadget/function/f_ncm.c | 2 +- include/linux/netdevice.h | 35 +++++++++++++++++++++++++++++++++++ net/atm/mpc.c | 2 +- net/core/dev.c | 5 ++--- net/core/netpoll.c | 3 +-- net/core/pktgen.c | 4 +--- net/packet/af_packet.c | 3 +-- net/sched/sch_teql.c | 3 +-- 9 files changed, 44 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 43c9960dce1c..81b22a180aad 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -193,7 +193,7 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) struct dlci_local *dlp = netdev_priv(dev); if (skb) - dlp->slave->netdev_ops->ndo_start_xmit(skb, dlp->slave); + netdev_start_xmit(skb, dlp->slave); return NETDEV_TX_OK; } diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index bcdc882cd415..cb5d646db6a7 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -1101,7 +1101,7 @@ static void ncm_tx_tasklet(unsigned long data) /* Only send if data is available. */ if (ncm->skb_tx_data) { ncm->timer_force_tx = true; - ncm->netdev->netdev_ops->ndo_start_xmit(NULL, ncm->netdev); + netdev_start_xmit(NULL, ncm->netdev); ncm->timer_force_tx = false; } } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eb73444e1bd0..220c50984688 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -782,6 +782,19 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX) * Required can not be NULL. * + * void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); + * A driver implements this function when it wishes to support + * deferred TX queue flushing. The idea is that the expensive + * operation to trigger TX queue processing can be done after + * N calls to ndo_start_xmit rather than being done every single + * time. In this regime ndo_start_xmit will be called one or more + * times, and then a final ndo_xmit_flush call will be made to + * have the driver tell the device about the new pending TX queue + * entries. The kernel keeps track of which queues need flushing + * by monitoring skb->queue_mapping of the packets it submits to + * ndo_start_xmit. This is the queue value that will be passed + * to ndo_xmit_flush. + * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * void *accel_priv, select_queue_fallback_t fallback); * Called to decide which queue to when device supports multiple @@ -1005,6 +1018,7 @@ struct net_device_ops { int (*ndo_stop)(struct net_device *dev); netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb, struct net_device *dev); + void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, void *accel_priv, @@ -3430,6 +3444,27 @@ int __init dev_proc_init(void); #define dev_proc_init() 0 #endif +static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, + struct sk_buff *skb, struct net_device *dev) +{ + netdev_tx_t ret; + u16 q; + + q = skb->queue_mapping; + ret = ops->ndo_start_xmit(skb, dev); + if (dev_xmit_complete(ret) && ops->ndo_xmit_flush) + ops->ndo_xmit_flush(dev, q); + + return ret; +} + +static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + return __netdev_start_xmit(ops, skb, dev); +} + int netdev_class_create_file_ns(struct class_attribute *class_attr, const void *ns); void netdev_class_remove_file_ns(struct class_attribute *class_attr, diff --git a/net/atm/mpc.c b/net/atm/mpc.c index e8e0e7a8a23d..d662da161e5a 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -599,7 +599,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb, } non_ip: - return mpc->old_ops->ndo_start_xmit(skb, dev); + return __netdev_start_xmit(mpc->old_ops, skb, dev); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) diff --git a/net/core/dev.c b/net/core/dev.c index b6a718ec11c1..26d296c2447c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2602,7 +2602,6 @@ EXPORT_SYMBOL(netif_skb_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { - const struct net_device_ops *ops = dev->netdev_ops; int rc = NETDEV_TX_OK; unsigned int skb_len; @@ -2667,7 +2666,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = ops->ndo_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev); trace_net_dev_xmit(skb, rc, dev, skb_len); if (rc == NETDEV_TX_OK) txq_trans_update(txq); @@ -2686,7 +2685,7 @@ gso: skb_len = nskb->len; trace_net_dev_start_xmit(nskb, dev); - rc = ops->ndo_start_xmit(nskb, dev); + rc = netdev_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 907fb5e36c02..a5ad06828d67 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -72,7 +72,6 @@ module_param(carrier_timeout, uint, 0644); static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { - const struct net_device_ops *ops = dev->netdev_ops; int status = NETDEV_TX_OK; netdev_features_t features; @@ -92,7 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = ops->ndo_start_xmit(skb, dev); + status = netdev_start_xmit(skb, dev); if (status == NETDEV_TX_OK) txq_trans_update(txq); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8b849ddfef2e..83e2b4b19eb7 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3285,8 +3285,6 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev *pkt_dev) { struct net_device *odev = pkt_dev->odev; - netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *) - = odev->netdev_ops->ndo_start_xmit; struct netdev_queue *txq; u16 queue_map; int ret; @@ -3339,7 +3337,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = (*xmit)(pkt_dev->skb, odev); + ret = netdev_start_xmit(pkt_dev->skb, odev); switch (ret) { case NETDEV_TX_OK: diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 93896d2092f6..0dfa990d4eaa 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -240,7 +240,6 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po); static int packet_direct_xmit(struct sk_buff *skb) { struct net_device *dev = skb->dev; - const struct net_device_ops *ops = dev->netdev_ops; netdev_features_t features; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; @@ -262,7 +261,7 @@ static int packet_direct_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) { - ret = ops->ndo_start_xmit(skb, dev); + ret = netdev_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) txq_trans_update(txq); } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index bd33793b527e..64cd93ca8104 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -301,7 +301,6 @@ restart: do { struct net_device *slave = qdisc_dev(q); struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0); - const struct net_device_ops *slave_ops = slave->netdev_ops; if (slave_txq->qdisc_sleeping != q) continue; @@ -317,7 +316,7 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) { + netdev_start_xmit(skb, slave) == NETDEV_TX_OK) { txq_trans_update(slave_txq); __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); -- cgit v1.2.3 From 48ea526a6877d605c961aa37fae33f3227b29424 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Mon, 25 Aug 2014 16:06:53 +0300 Subject: net/mlx4: Use is_kdump_kernel() to detect kdump kernel Use is_kdump_kernel() to detect kdump kernel, instead of reset_devices. Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 071f6b234604..783dd099abd1 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -1275,7 +1276,7 @@ int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, /* Returns true if running in low memory profile (kdump kernel) */ static inline bool mlx4_low_memory_profile(void) { - return reset_devices; + return is_kdump_kernel(); } #endif /* MLX4_DEVICE_H */ -- cgit v1.2.3 From 0b725a2ca61bedc33a2a63d0451d528b268cf975 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 25 Aug 2014 15:51:53 -0700 Subject: net: Remove ndo_xmit_flush netdev operation, use signalling instead. As reported by Jesper Dangaard Brouer, for high packet rates the overhead of having another indirect call in the TX path is non-trivial. There is the indirect call itself, and then there is all of the reloading of the state to refetch the tail pointer value and then write the device register. Move to a more passive scheme, which requires very light modifications to the device drivers. The signal is a new skb->xmit_more value, if it is non-zero it means that more SKBs are pending to be transmitted on the same queue as the current SKB. And therefore, the driver may elide the tail pointer update. Right now skb->xmit_more is always zero. Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 36 +++++++++++-------------------- drivers/net/virtio_net.c | 12 +++-------- include/linux/netdevice.h | 25 ++------------------- include/linux/skbuff.h | 2 ++ 4 files changed, 19 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index b9c020a05fb8..89c29b40d61c 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -136,7 +136,6 @@ static void igb_update_phy_info(unsigned long); static void igb_watchdog(unsigned long); static void igb_watchdog_task(struct work_struct *); static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *); -static void igb_xmit_flush(struct net_device *netdev, u16 queue); static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats); static int igb_change_mtu(struct net_device *, int); @@ -2076,7 +2075,6 @@ static const struct net_device_ops igb_netdev_ops = { .ndo_open = igb_open, .ndo_stop = igb_close, .ndo_start_xmit = igb_xmit_frame, - .ndo_xmit_flush = igb_xmit_flush, .ndo_get_stats64 = igb_get_stats64, .ndo_set_rx_mode = igb_set_rx_mode, .ndo_set_mac_address = igb_set_mac, @@ -4917,6 +4915,14 @@ static void igb_tx_map(struct igb_ring *tx_ring, tx_ring->next_to_use = i; + if (!skb->xmit_more) { + writel(i, tx_ring->tail); + + /* we need this if more than one processor can write to our tail + * at a time, it synchronizes IO on IA64/Altix systems + */ + mmiowb(); + } return; dma_error: @@ -5052,20 +5058,17 @@ out_drop: return NETDEV_TX_OK; } -static struct igb_ring *__igb_tx_queue_mapping(struct igb_adapter *adapter, unsigned int r_idx) +static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter, + struct sk_buff *skb) { + unsigned int r_idx = skb->queue_mapping; + if (r_idx >= adapter->num_tx_queues) r_idx = r_idx % adapter->num_tx_queues; return adapter->tx_ring[r_idx]; } -static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter, - struct sk_buff *skb) -{ - return __igb_tx_queue_mapping(adapter, skb->queue_mapping); -} - static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *netdev) { @@ -5094,21 +5097,6 @@ static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb)); } -static void igb_xmit_flush(struct net_device *netdev, u16 queue) -{ - struct igb_adapter *adapter = netdev_priv(netdev); - struct igb_ring *tx_ring; - - tx_ring = __igb_tx_queue_mapping(adapter, queue); - - writel(tx_ring->next_to_use, tx_ring->tail); - - /* we need this if more than one processor can write to our tail - * at a time, it synchronizes IO on IA64/Altix systems - */ - mmiowb(); -} - /** * igb_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 62421086d3e6..f0c2824f5e0f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -953,15 +953,10 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) } } - return NETDEV_TX_OK; -} + if (!skb->xmit_more) + virtqueue_kick(sq->vq); -static void xmit_flush(struct net_device *dev, u16 qnum) -{ - struct virtnet_info *vi = netdev_priv(dev); - struct send_queue *sq = &vi->sq[qnum]; - - virtqueue_kick(sq->vq); + return NETDEV_TX_OK; } /* @@ -1393,7 +1388,6 @@ static const struct net_device_ops virtnet_netdev = { .ndo_open = virtnet_open, .ndo_stop = virtnet_close, .ndo_start_xmit = start_xmit, - .ndo_xmit_flush = xmit_flush, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = virtnet_set_mac_address, .ndo_set_rx_mode = virtnet_set_rx_mode, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 220c50984688..039b23786c22 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -782,19 +782,6 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX) * Required can not be NULL. * - * void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); - * A driver implements this function when it wishes to support - * deferred TX queue flushing. The idea is that the expensive - * operation to trigger TX queue processing can be done after - * N calls to ndo_start_xmit rather than being done every single - * time. In this regime ndo_start_xmit will be called one or more - * times, and then a final ndo_xmit_flush call will be made to - * have the driver tell the device about the new pending TX queue - * entries. The kernel keeps track of which queues need flushing - * by monitoring skb->queue_mapping of the packets it submits to - * ndo_start_xmit. This is the queue value that will be passed - * to ndo_xmit_flush. - * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * void *accel_priv, select_queue_fallback_t fallback); * Called to decide which queue to when device supports multiple @@ -1018,7 +1005,6 @@ struct net_device_ops { int (*ndo_stop)(struct net_device *dev); netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb, struct net_device *dev); - void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, void *accel_priv, @@ -3447,15 +3433,8 @@ int __init dev_proc_init(void); static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev) { - netdev_tx_t ret; - u16 q; - - q = skb->queue_mapping; - ret = ops->ndo_start_xmit(skb, dev); - if (dev_xmit_complete(ret) && ops->ndo_xmit_flush) - ops->ndo_xmit_flush(dev, q); - - return ret; + skb->xmit_more = 0; + return ops->ndo_start_xmit(skb, dev); } static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 18ddf9684a27..9b3802a197a8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -452,6 +452,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, * @tc_verd: traffic control verdict * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices + * @xmit_more: More SKBs are pending for this queue * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -558,6 +559,7 @@ struct sk_buff { __u16 queue_mapping; kmemcheck_bitfield_begin(flags2); + __u8 xmit_more:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif -- cgit v1.2.3 From 453a940ea725d692282f9e66475cec0d1b1e12f2 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 25 Aug 2014 17:03:47 -0700 Subject: net: make skb an optional parameter for__skb_flow_dissect() Fixes: commit 690e36e726d00d2 (net: Allow raw buffers to be passed into the flow dissector) Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/flow_keys.h | 4 ++-- net/core/flow_dissector.c | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index 4040f63932c5..9a03f73c4974 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -28,10 +28,10 @@ struct flow_keys { }; bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, - void *data, int hlen); + void *data, __be16 proto, int nhoff, int hlen); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) { - return __skb_flow_dissect(skb, flow, NULL, 0); + return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0); } __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, void *data, int hlen_proto); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ae8f0db67aed..12f48ca7a0b0 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -59,14 +59,26 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); -bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, void *data, int hlen) +/** + * __skb_flow_dissect - extract the flow_keys struct and return it + * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol + * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * + * The function will try to retrieve the struct flow_keys from either the skbuff + * or a raw buffer specified by the rest parameters + */ +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + void *data, __be16 proto, int nhoff, int hlen) { - int nhoff = skb_network_offset(skb); u8 ip_proto; - __be16 proto = skb->protocol; if (!data) { data = skb->data; + proto = skb->protocol; + nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); } -- cgit v1.2.3 From 970fdfa89babb5a6f1a3d345e8cb54d92c1e3a8f Mon Sep 17 00:00:00 2001 From: Vladimir Kondratiev Date: Mon, 11 Aug 2014 03:29:57 -0700 Subject: cfg80211: remove @gfp parameter from cfg80211_rx_mgmt() In the cfg80211_rx_mgmt(), parameter @gfp was used for the memory allocation. But, memory get allocated under spin_lock_bh(), this implies atomic context. So, one can't use GFP_KERNEL, only variants with no __GFP_WAIT. Actually, in all occurrences GFP_ATOMIC is used (wil6210 use GFP_KERNEL by mistake), and it should be this way or warning triggered in the memory allocation code. Remove @gfp parameter as no actual choice exist, and use hard coded GFP_ATOMIC for memory allocation. Signed-off-by: Vladimir Kondratiev Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/wmi.c | 5 ++--- drivers/net/wireless/ath/wil6210/wmi.c | 2 +- drivers/net/wireless/brcm80211/brcmfmac/p2p.c | 6 ++---- drivers/net/wireless/mwifiex/util.c | 2 +- drivers/staging/rtl8723au/core/rtw_mlme_ext.c | 2 +- drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c | 4 ++-- include/net/cfg80211.h | 3 +-- net/mac80211/rx.c | 2 +- net/wireless/mlme.c | 4 ++-- 9 files changed, 13 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c index 94df345d08c2..77fcca1f5bd6 100644 --- a/drivers/net/wireless/ath/ath6kl/wmi.c +++ b/drivers/net/wireless/ath/ath6kl/wmi.c @@ -619,8 +619,7 @@ static int ath6kl_wmi_rx_probe_req_event_rx(struct wmi *wmi, u8 *datap, int len, dlen, freq, vif->probe_req_report); if (vif->probe_req_report || vif->nw_type == AP_NETWORK) - cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0, - GFP_ATOMIC); + cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0); return 0; } @@ -659,7 +658,7 @@ static int ath6kl_wmi_rx_action_event_rx(struct wmi *wmi, u8 *datap, int len, return -EINVAL; } ath6kl_dbg(ATH6KL_DBG_WMI, "rx_action: len=%u freq=%u\n", dlen, freq); - cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0, GFP_ATOMIC); + cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0); return 0; } diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c index 1d1d0afdd2e1..335bc38325db 100644 --- a/drivers/net/wireless/ath/wil6210/wmi.c +++ b/drivers/net/wireless/ath/wil6210/wmi.c @@ -350,7 +350,7 @@ static void wmi_evt_rx_mgmt(struct wil6210_priv *wil, int id, void *d, int len) } } else { cfg80211_rx_mgmt(wil->wdev, freq, signal, - (void *)rx_mgmt_frame, d_len, 0, GFP_KERNEL); + (void *)rx_mgmt_frame, d_len, 0); } } diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c index 057b982ea8b3..1d78a91db594 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c @@ -1431,8 +1431,7 @@ int brcmf_p2p_notify_action_frame_rx(struct brcmf_if *ifp, IEEE80211_BAND_5GHZ); wdev = &ifp->vif->wdev; - cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len, 0, - GFP_ATOMIC); + cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len, 0); kfree(mgmt_frame); return 0; @@ -1896,8 +1895,7 @@ s32 brcmf_p2p_notify_rx_mgmt_p2p_probereq(struct brcmf_if *ifp, IEEE80211_BAND_2GHZ : IEEE80211_BAND_5GHZ); - cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len, 0, - GFP_ATOMIC); + cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len, 0); brcmf_dbg(INFO, "mgmt_frame_len (%d) , e->datalen (%d), chanspec (%04x), freq (%d)\n", mgmt_frame_len, e->datalen, chanspec, freq); diff --git a/drivers/net/wireless/mwifiex/util.c b/drivers/net/wireless/mwifiex/util.c index cee028321a9a..ec79c49de097 100644 --- a/drivers/net/wireless/mwifiex/util.c +++ b/drivers/net/wireless/mwifiex/util.c @@ -172,7 +172,7 @@ mwifiex_process_mgmt_packet(struct mwifiex_private *priv, cfg80211_rx_mgmt(priv->wdev, priv->roc_cfg.chan.center_freq, CAL_RSSI(rx_pd->snr, rx_pd->nf), skb->data, pkt_len, - 0, GFP_ATOMIC); + 0); return 0; } diff --git a/drivers/staging/rtl8723au/core/rtw_mlme_ext.c b/drivers/staging/rtl8723au/core/rtw_mlme_ext.c index c5fdcb89dacd..2a1502f351f8 100644 --- a/drivers/staging/rtl8723au/core/rtw_mlme_ext.c +++ b/drivers/staging/rtl8723au/core/rtw_mlme_ext.c @@ -2128,7 +2128,7 @@ static int on_action_public23a(struct rtw_adapter *padapter, IEEE80211_BAND_5GHZ); if (cfg80211_rx_mgmt(padapter->rtw_wdev, freq, 0, pframe, - skb->len, 0, GFP_ATOMIC)) + skb->len, 0)) return _SUCCESS; return _FAIL; diff --git a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c index 93dc844a10b3..2d6d7d1a5b7d 100644 --- a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c @@ -2379,7 +2379,7 @@ void rtw_cfg80211_indicate_sta_assoc(struct rtw_adapter *padapter, IEEE80211_BAND_5GHZ); cfg80211_rx_mgmt(padapter->rtw_wdev, freq, 0, pmgmt_frame, frame_len, - 0, GFP_ATOMIC); + 0); #endif /* defined(RTW_USE_CFG80211_STA_EVENT) */ } @@ -2425,7 +2425,7 @@ void rtw_cfg80211_indicate_sta_disassoc(struct rtw_adapter *padapter, frame_len = sizeof(struct ieee80211_hdr_3addr) + 2; cfg80211_rx_mgmt(padapter->rtw_wdev, freq, 0, (u8 *)&mgmt, frame_len, - 0, GFP_ATOMIC); + 0); #endif /* defined(RTW_USE_CFG80211_STA_EVENT) */ } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0a080c4de275..7b8dac3efe8f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4412,7 +4412,6 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * @buf: Management frame (header + body) * @len: length of the frame data * @flags: flags, as defined in enum nl80211_rxmgmt_flags - * @gfp: context flags * * This function is called whenever an Action frame is received for a station * mode interface, but is not processed in kernel. @@ -4423,7 +4422,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * driver is responsible for rejecting the frame. */ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags, gfp_t gfp); + const u8 *buf, size_t len, u32 flags); /** * cfg80211_mgmt_tx_status - notification of TX status for management frame diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index bd2c9b22c945..a8d862f9183c 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2725,7 +2725,7 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) sig = status->signal; if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, - rx->skb->data, rx->skb->len, 0, GFP_ATOMIC)) { + rx->skb->data, rx->skb->len, 0)) { if (rx->sta) rx->sta->rx_packets++; dev_kfree_skb(rx->skb); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 266766b8d80b..369fc334fdad 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -605,7 +605,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, } bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, - const u8 *buf, size_t len, u32 flags, gfp_t gfp) + const u8 *buf, size_t len, u32 flags) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -648,7 +648,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, /* Indicate the received Action frame to user space */ if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, freq, sig_mbm, - buf, len, flags, gfp)) + buf, len, flags, GFP_ATOMIC)) continue; result = true; -- cgit v1.2.3 From 170fd0b1f6108b48df4369afa0ee29a83e922748 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 30 Jul 2014 14:36:18 +0300 Subject: ieee80211: Support parsing TPC report element in action frames TPC report element is contained in spectrum management's tpc report action frames and in radio measurement's link measurement report action frames. Add a function which checks whether an action frame contains this element. This may be needed by the drivers in order to set the correct tx power value in these frames. Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 63ab3873c5ed..8018c915ee63 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -838,6 +838,16 @@ enum ieee80211_vht_opmode_bits { #define WLAN_SA_QUERY_TR_ID_LEN 2 +/** + * struct ieee80211_tpc_report_ie + * + * This structure refers to "TPC Report element" + */ +struct ieee80211_tpc_report_ie { + u8 tx_power; + u8 link_margin; +} __packed; + struct ieee80211_mgmt { __le16 frame_control; __le16 duration; @@ -973,6 +983,13 @@ struct ieee80211_mgmt { u8 action_code; u8 operating_mode; } __packed vht_opmode_notif; + struct { + u8 action_code; + u8 dialog_token; + u8 tpc_elem_id; + u8 tpc_elem_length; + struct ieee80211_tpc_report_ie tpc; + } __packed tpc_report; } u; } __packed action; } u; @@ -1865,6 +1882,7 @@ enum ieee80211_category { WLAN_CATEGORY_DLS = 2, WLAN_CATEGORY_BACK = 3, WLAN_CATEGORY_PUBLIC = 4, + WLAN_CATEGORY_RADIO_MEASUREMENT = 5, WLAN_CATEGORY_HT = 7, WLAN_CATEGORY_SA_QUERY = 8, WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION = 9, @@ -2378,4 +2396,51 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, #define TU_TO_JIFFIES(x) (usecs_to_jiffies((x) * 1024)) #define TU_TO_EXP_TIME(x) (jiffies + TU_TO_JIFFIES(x)) +/** + * ieee80211_action_contains_tpc - checks if the frame contains TPC element + * @skb: the skb containing the frame, length will be checked + * + * This function checks if it's either TPC report action frame or Link + * Measurement report action frame as defined in IEEE Std. 802.11-2012 8.5.2.5 + * and 8.5.7.5 accordingly. + */ +static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *)skb->data; + + if (!ieee80211_is_action(mgmt->frame_control)) + return false; + + if (skb->len < IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.tpc_report)) + return false; + + /* + * TPC report - check that: + * category = 0 (Spectrum Management) or 5 (Radio Measurement) + * spectrum management action = 3 (TPC/Link Measurement report) + * TPC report EID = 35 + * TPC report element length = 2 + * + * The spectrum management's tpc_report struct is used here both for + * parsing tpc_report and radio measurement's link measurement report + * frame, since the relevant part is identical in both frames. + */ + if (mgmt->u.action.category != WLAN_CATEGORY_SPECTRUM_MGMT && + mgmt->u.action.category != WLAN_CATEGORY_RADIO_MEASUREMENT) + return false; + + /* both spectrum mgmt and link measurement have same action code */ + if (mgmt->u.action.u.tpc_report.action_code != + WLAN_ACTION_SPCT_TPC_RPRT) + return false; + + if (mgmt->u.action.u.tpc_report.tpc_elem_id != WLAN_EID_TPC_REPORT || + mgmt->u.action.u.tpc_report.tpc_elem_length != + sizeof(struct ieee80211_tpc_report_ie)) + return false; + + return true; +} + #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From ca34e3b5c808385b175650605faa29e71e91991b Mon Sep 17 00:00:00 2001 From: Ido Yariv Date: Tue, 29 Jul 2014 15:38:53 +0300 Subject: mac80211: Fix accounting of the tailroom-needed counter When hw acceleration is enabled, the GENERATE_IV or PUT_IV_SPACE flags will only require headroom space. Consequently, the tailroom-needed counter can safely be decremented. Signed-off-by: Ido Yariv Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 7 +++++-- net/mac80211/key.c | 12 +++--------- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 1cd84444665c..1fbed0a6d556 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1226,7 +1226,8 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); * * @IEEE80211_KEY_FLAG_GENERATE_IV: This flag should be set by the * driver to indicate that it requires IV generation for this - * particular key. + * particular key. Setting this flag does not necessarily mean that SKBs + * will have sufficient tailroom for ICV or MIC. * @IEEE80211_KEY_FLAG_GENERATE_MMIC: This flag should be set by * the driver for a TKIP key if it requires Michael MIC * generation in software. @@ -1238,7 +1239,9 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); * @IEEE80211_KEY_FLAG_PUT_IV_SPACE: This flag should be set by the driver * if space should be prepared for the IV, but the IV * itself should not be generated. Do not set together with - * @IEEE80211_KEY_FLAG_GENERATE_IV on the same key. + * @IEEE80211_KEY_FLAG_GENERATE_IV on the same key. Setting this flag does + * not necessarily mean that SKBs will have sufficient tailroom for ICV or + * MIC. * @IEEE80211_KEY_FLAG_RX_MGMT: This key will be used to decrypt received * management frames. The flag can help drivers that have a hardware * crypto implementation that doesn't deal with management frames diff --git a/net/mac80211/key.c b/net/mac80211/key.c index d808cff80153..6429d0e1d4a1 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -130,9 +130,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) if (!ret) { key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || - (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) || - (key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))) + if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) sdata->crypto_tx_tailroom_needed_cnt--; WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && @@ -180,9 +178,7 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) sta = key->sta; sdata = key->sdata; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || - (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) || - (key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))) + if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) increment_tailroom_need_count(sdata); ret = drv_set_key(key->local, DISABLE_KEY, sdata, @@ -878,9 +874,7 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf) if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || - (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) || - (key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))) + if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) increment_tailroom_need_count(key->sdata); } -- cgit v1.2.3 From c70f59a2a007c57843195a93c3b7308204e0a5ab Mon Sep 17 00:00:00 2001 From: Ido Yariv Date: Tue, 29 Jul 2014 15:39:14 +0300 Subject: mac80211: don't resize skbs needlessly Header-less cloned skbs with sufficient headroom need not be cloned unless the tailroom is going to be modified. Fix ieee80211_skb_resize so it would only resize cloned skbs if either the header isn't released or the tailroom is going to be modified. Some drivers might have assumed that skbs are never cloned, so add a HW flag that explicitly permits cloned TX skbs. Drivers which do not modify TX skbs should set this flag to avoid copying skbs. Signed-off-by: Ido Yariv Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 5 ++++- net/mac80211/tx.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 1fbed0a6d556..c9b2bec8db47 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1609,6 +1609,9 @@ struct ieee80211_tx_control { * is not enabled the default action is to disconnect when getting the * CSA frame. * + * @IEEE80211_HW_SUPPORTS_CLONED_SKBS: The driver will never modify the payload + * or tailroom of TX skbs without copying them first. + * * @IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS: The HW supports scanning on all bands * in one command, mac80211 doesn't have to run separate scans per band. */ @@ -1642,7 +1645,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_TIMING_BEACON_ONLY = 1<<26, IEEE80211_HW_SUPPORTS_HT_CCK_RATES = 1<<27, IEEE80211_HW_CHANCTX_STA_CSA = 1<<28, - /* bit 29 unused */ + IEEE80211_HW_SUPPORTS_CLONED_SKBS = 1<<29, IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS = 1<<30, }; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2051cc60f8ce..925c39f4099e 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1478,7 +1478,10 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, tail_need = max_t(int, tail_need, 0); } - if (skb_cloned(skb)) + if (skb_cloned(skb) && + (!(local->hw.flags & IEEE80211_HW_SUPPORTS_CLONED_SKBS) || + !skb_clone_writable(skb, ETH_HLEN) || + sdata->crypto_tx_tailroom_needed_cnt)) I802_DEBUG_INC(local->tx_expand_skb_head_cloned); else if (head_need || tail_need) I802_DEBUG_INC(local->tx_expand_skb_head); -- cgit v1.2.3 From 0e227084aee36b3ba27b4fc9cd9e425be6ce2ab8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 12 Aug 2014 20:34:30 +0200 Subject: cfg80211: clarify BSS probe response vs. beacon data There are a few possible cases of where BSS data came from: 1) only a beacon has been received 2) only a probe response has been received 3) the driver didn't report what it received (this happens when using cfg80211_inform_bss[_width]()) 4) both probe response and beacon data has been received Unfortunately, in the userspace API, a few things weren't there: a) there was no way to differentiate cases 1) and 4) above without comparing the data of the IEs b) the TSF was always from the last frame, instead of being exposed for beacon/probe response separately like IEs Fix this by i) exporting a new flag attribute that indicates whether or not probe response data has been received - this addresses (a) ii) exporting a BEACON_TSF attribute that holds the beacon's TSF if a beacon has been received iii) not exporting the beacon attributes in case (3) above as that would just lead userspace into thinking the data actually came from a beacon when that isn't clear To implement this, track inside the IEs struct whether or not it (definitely) came from a beacon. Reported-by: William Seto Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 16 ++++++++++++++-- net/wireless/nl80211.c | 16 ++++++++++++---- net/wireless/scan.c | 6 ++++-- 4 files changed, 32 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7b8dac3efe8f..77b85a89abca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1503,12 +1503,14 @@ enum cfg80211_signal_type { * @tsf: TSF contained in the frame that carried these IEs * @rcu_head: internal use, for freeing * @len: length of the IEs + * @from_beacon: these IEs are known to come from a beacon * @data: IE data */ struct cfg80211_bss_ies { u64 tsf; struct rcu_head rcu_head; int len; + bool from_beacon; u8 data[]; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f1db15b9c041..d097568da690 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3055,14 +3055,20 @@ enum nl80211_bss_scan_width { * @NL80211_BSS_BSSID: BSSID of the BSS (6 octets) * @NL80211_BSS_FREQUENCY: frequency in MHz (u32) * @NL80211_BSS_TSF: TSF of the received probe response/beacon (u64) + * (if @NL80211_BSS_PRESP_DATA is present then this is known to be + * from a probe response, otherwise it may be from the same beacon + * that the NL80211_BSS_BEACON_TSF will be from) * @NL80211_BSS_BEACON_INTERVAL: beacon interval of the (I)BSS (u16) * @NL80211_BSS_CAPABILITY: capability field (CPU order, u16) * @NL80211_BSS_INFORMATION_ELEMENTS: binary attribute containing the * raw information elements from the probe response/beacon (bin); - * if the %NL80211_BSS_BEACON_IES attribute is present, the IEs here are - * from a Probe Response frame; otherwise they are from a Beacon frame. + * if the %NL80211_BSS_BEACON_IES attribute is present and the data is + * different then the IEs here are from a Probe Response frame; otherwise + * they are from a Beacon frame. * However, if the driver does not indicate the source of the IEs, these * IEs may be from either frame subtype. + * If present, the @NL80211_BSS_PRESP_DATA attribute indicates that the + * data here is known to be from a probe response, without any heuristics. * @NL80211_BSS_SIGNAL_MBM: signal strength of probe response/beacon * in mBm (100 * dBm) (s32) * @NL80211_BSS_SIGNAL_UNSPEC: signal strength of the probe response/beacon @@ -3074,6 +3080,10 @@ enum nl80211_bss_scan_width { * yet been received * @NL80211_BSS_CHAN_WIDTH: channel width of the control channel * (u32, enum nl80211_bss_scan_width) + * @NL80211_BSS_BEACON_TSF: TSF of the last received beacon (u64) + * (not present if no beacon frame has been received yet) + * @NL80211_BSS_PRESP_DATA: the data in @NL80211_BSS_INFORMATION_ELEMENTS and + * @NL80211_BSS_TSF is known to be from a probe response (flag attribute) * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -3091,6 +3101,8 @@ enum nl80211_bss { NL80211_BSS_SEEN_MS_AGO, NL80211_BSS_BEACON_IES, NL80211_BSS_CHAN_WIDTH, + NL80211_BSS_BEACON_TSF, + NL80211_BSS_PRESP_DATA, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index df7b1332a1ec..3011401f52c0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6033,7 +6033,6 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, const struct cfg80211_bss_ies *ies; void *hdr; struct nlattr *bss; - bool tsf = false; ASSERT_WDEV_LOCK(wdev); @@ -6060,18 +6059,27 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; rcu_read_lock(); + /* indicate whether we have probe response data or not */ + if (rcu_access_pointer(res->proberesp_ies) && + nla_put_flag(msg, NL80211_BSS_PRESP_DATA)) + goto fail_unlock_rcu; + + /* this pointer prefers to be pointed to probe response data + * but is always valid + */ ies = rcu_dereference(res->ies); if (ies) { if (nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf)) goto fail_unlock_rcu; - tsf = true; if (ies->len && nla_put(msg, NL80211_BSS_INFORMATION_ELEMENTS, ies->len, ies->data)) goto fail_unlock_rcu; } + + /* and this pointer is always (unless driver didn't know) beacon data */ ies = rcu_dereference(res->beacon_ies); - if (ies) { - if (!tsf && nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf)) + if (ies && ies->from_beacon) { + if (nla_put_u64(msg, NL80211_BSS_BEACON_TSF, ies->tsf)) goto fail_unlock_rcu; if (ies->len && nla_put(msg, NL80211_BSS_BEACON_IES, ies->len, ies->data)) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 0798c62e6085..ad1a1a2808d3 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -918,11 +918,12 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, * override the IEs pointer should we have received an earlier * indication of Probe Response data. */ - ies = kmalloc(sizeof(*ies) + ielen, gfp); + ies = kzalloc(sizeof(*ies) + ielen, gfp); if (!ies) return NULL; ies->len = ielen; ies->tsf = tsf; + ies->from_beacon = false; memcpy(ies->data, ie, ielen); rcu_assign_pointer(tmp.pub.beacon_ies, ies); @@ -982,11 +983,12 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy, if (!channel) return NULL; - ies = kmalloc(sizeof(*ies) + ielen, gfp); + ies = kzalloc(sizeof(*ies) + ielen, gfp); if (!ies) return NULL; ies->len = ielen; ies->tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); + ies->from_beacon = ieee80211_is_beacon(mgmt->frame_control); memcpy(ies->data, mgmt->u.probe_resp.variable, ielen); if (ieee80211_is_probe_resp(mgmt->frame_control)) -- cgit v1.2.3 From 5bc8c1f2b070bab82ed738f98ecfac725e33c57f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 12 Aug 2014 21:01:28 +0200 Subject: cfg80211: allow passing frame type to cfg80211_inform_bss() When using the cfg80211_inform_bss[_width]() functions drivers cannot currently indicate whether the data was received in a beacon or probe response. Fix that by passing a new enum that indicates such (or unknown). For good measure, use it in ath6kl. Acked-by: Kalle Valo [ath6kl] Acked-by: Arend van Spriel [brcmfmac] Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 1 + drivers/net/wireless/ath/ath6kl/wmi.c | 43 +++++----------------- drivers/net/wireless/ath/wil6210/wmi.c | 2 +- .../net/wireless/brcm80211/brcmfmac/wl_cfg80211.c | 18 ++++++--- drivers/net/wireless/libertas/cfg.c | 2 + drivers/net/wireless/mwifiex/cfg80211.c | 1 + drivers/net/wireless/mwifiex/scan.c | 3 +- drivers/net/wireless/orinoco/scan.c | 14 ++++--- drivers/net/wireless/rndis_wlan.c | 14 ++++--- drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c | 1 + drivers/staging/wlan-ng/cfg80211.c | 1 + include/net/cfg80211.h | 20 +++++++++- net/wireless/scan.c | 15 +++++++- 13 files changed, 77 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index e535807c3d89..ba60e37213eb 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -717,6 +717,7 @@ ath6kl_add_bss_if_needed(struct ath6kl_vif *vif, memcpy(ie + 2, vif->ssid, vif->ssid_len); memcpy(ie + 2 + vif->ssid_len, beacon_ie, beacon_ie_len); bss = cfg80211_inform_bss(ar->wiphy, chan, + CFG80211_BSS_FTYPE_UNKNOWN, bssid, 0, cap_val, 100, ie, 2 + vif->ssid_len + beacon_ie_len, 0, GFP_KERNEL); diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c index 77fcca1f5bd6..b921005ad7ee 100644 --- a/drivers/net/wireless/ath/ath6kl/wmi.c +++ b/drivers/net/wireless/ath/ath6kl/wmi.c @@ -1092,7 +1092,6 @@ static int ath6kl_wmi_bssinfo_event_rx(struct wmi *wmi, u8 *datap, int len, u8 *buf; struct ieee80211_channel *channel; struct ath6kl *ar = wmi->parent_dev; - struct ieee80211_mgmt *mgmt; struct cfg80211_bss *bss; if (len <= sizeof(struct wmi_bss_info_hdr2)) @@ -1138,39 +1137,15 @@ static int ath6kl_wmi_bssinfo_event_rx(struct wmi *wmi, u8 *datap, int len, } } - /* - * In theory, use of cfg80211_inform_bss() would be more natural here - * since we do not have the full frame. However, at least for now, - * cfg80211 can only distinguish Beacon and Probe Response frames from - * each other when using cfg80211_inform_bss_frame(), so let's build a - * fake IEEE 802.11 header to be able to take benefit of this. - */ - mgmt = kmalloc(24 + len, GFP_ATOMIC); - if (mgmt == NULL) - return -EINVAL; - - if (bih->frame_type == BEACON_FTYPE) { - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_BEACON); - memset(mgmt->da, 0xff, ETH_ALEN); - } else { - struct net_device *dev = vif->ndev; - - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_PROBE_RESP); - memcpy(mgmt->da, dev->dev_addr, ETH_ALEN); - } - mgmt->duration = cpu_to_le16(0); - memcpy(mgmt->sa, bih->bssid, ETH_ALEN); - memcpy(mgmt->bssid, bih->bssid, ETH_ALEN); - mgmt->seq_ctrl = cpu_to_le16(0); - - memcpy(&mgmt->u.beacon, buf, len); - - bss = cfg80211_inform_bss_frame(ar->wiphy, channel, mgmt, - 24 + len, (bih->snr - 95) * 100, - GFP_ATOMIC); - kfree(mgmt); + bss = cfg80211_inform_bss(ar->wiphy, channel, + bih->frame_type == BEACON_FTYPE ? + CFG80211_BSS_FTYPE_BEACON : + CFG80211_BSS_FTYPE_PRESP, + bih->bssid, get_unaligned_le64((__le64 *)buf), + get_unaligned_le16(((__le16 *)buf) + 5), + get_unaligned_le16(((__le16 *)buf) + 4), + buf + 8 + 2 + 2, len - 8 - 2 - 2, + (bih->snr - 95) * 100, GFP_ATOMIC); if (bss == NULL) return -ENOMEM; cfg80211_put_bss(ar->wiphy, bss); diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c index 335bc38325db..960b66fc1430 100644 --- a/drivers/net/wireless/ath/wil6210/wmi.c +++ b/drivers/net/wireless/ath/wil6210/wmi.c @@ -346,7 +346,7 @@ static void wmi_evt_rx_mgmt(struct wil6210_priv *wil, int id, void *d, int len) rx_mgmt_frame->bssid); cfg80211_put_bss(wiphy, bss); } else { - wil_err(wil, "cfg80211_inform_bss() failed\n"); + wil_err(wil, "cfg80211_inform_bss_frame() failed\n"); } } else { cfg80211_rx_mgmt(wil->wdev, freq, signal, diff --git a/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c b/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c index 02fe706fc9ec..12a60ca1462a 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c @@ -2394,9 +2394,13 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg, brcmf_dbg(CONN, "Beacon interval: %d\n", notify_interval); brcmf_dbg(CONN, "Signal: %d\n", notify_signal); - bss = cfg80211_inform_bss(wiphy, notify_channel, (const u8 *)bi->BSSID, - 0, notify_capability, notify_interval, notify_ie, - notify_ielen, notify_signal, GFP_KERNEL); + bss = cfg80211_inform_bss(wiphy, notify_channel, + CFG80211_BSS_FTYPE_UNKNOWN, + (const u8 *)bi->BSSID, + 0, notify_capability, + notify_interval, notify_ie, + notify_ielen, notify_signal, + GFP_KERNEL); if (!bss) return -ENOMEM; @@ -2498,9 +2502,11 @@ static s32 wl_inform_ibss(struct brcmf_cfg80211_info *cfg, brcmf_dbg(CONN, "beacon interval: %d\n", notify_interval); brcmf_dbg(CONN, "signal: %d\n", notify_signal); - bss = cfg80211_inform_bss(wiphy, notify_channel, bssid, - 0, notify_capability, notify_interval, - notify_ie, notify_ielen, notify_signal, GFP_KERNEL); + bss = cfg80211_inform_bss(wiphy, notify_channel, + CFG80211_BSS_FTYPE_UNKNOWN, bssid, 0, + notify_capability, notify_interval, + notify_ie, notify_ielen, notify_signal, + GFP_KERNEL); if (!bss) { err = -ENOMEM; diff --git a/drivers/net/wireless/libertas/cfg.c b/drivers/net/wireless/libertas/cfg.c index 47a998d8f99e..22884ba7d6cc 100644 --- a/drivers/net/wireless/libertas/cfg.c +++ b/drivers/net/wireless/libertas/cfg.c @@ -653,6 +653,7 @@ static int lbs_ret_scan(struct lbs_private *priv, unsigned long dummy, if (channel && !(channel->flags & IEEE80211_CHAN_DISABLED)) { bss = cfg80211_inform_bss(wiphy, channel, + CFG80211_BSS_FTYPE_UNKNOWN, bssid, get_unaligned_le64(tsfdesc), capa, intvl, ie, ielen, LBS_SCAN_RSSI_TO_MBM(rssi), @@ -1754,6 +1755,7 @@ static void lbs_join_post(struct lbs_private *priv, bss = cfg80211_inform_bss(priv->wdev->wiphy, params->chandef.chan, + CFG80211_BSS_FTYPE_UNKNOWN, bssid, 0, capability, diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c index e2e6bf13c2d8..15f994f3b4ce 100644 --- a/drivers/net/wireless/mwifiex/cfg80211.c +++ b/drivers/net/wireless/mwifiex/cfg80211.c @@ -1557,6 +1557,7 @@ static int mwifiex_cfg80211_inform_ibss_bss(struct mwifiex_private *priv) band)); bss = cfg80211_inform_bss(priv->wdev->wiphy, chan, + CFG80211_BSS_FTYPE_UNKNOWN, bss_info.bssid, 0, WLAN_CAPABILITY_IBSS, 0, ie_buf, ie_len, 0, GFP_KERNEL); cfg80211_put_bss(priv->wdev->wiphy, bss); diff --git a/drivers/net/wireless/mwifiex/scan.c b/drivers/net/wireless/mwifiex/scan.c index dee717a19ddb..195ef0ca343f 100644 --- a/drivers/net/wireless/mwifiex/scan.c +++ b/drivers/net/wireless/mwifiex/scan.c @@ -1719,7 +1719,8 @@ mwifiex_parse_single_response_buf(struct mwifiex_private *priv, u8 **bss_info, if (chan && !(chan->flags & IEEE80211_CHAN_DISABLED)) { bss = cfg80211_inform_bss(priv->wdev->wiphy, - chan, bssid, timestamp, + chan, CFG80211_BSS_FTYPE_UNKNOWN, + bssid, timestamp, cap_info_bitmap, beacon_period, ie_buf, ie_len, rssi, GFP_KERNEL); bss_priv = (struct mwifiex_bss_priv *)bss->priv; diff --git a/drivers/net/wireless/orinoco/scan.c b/drivers/net/wireless/orinoco/scan.c index e175b9b8561b..2c66166add70 100644 --- a/drivers/net/wireless/orinoco/scan.c +++ b/drivers/net/wireless/orinoco/scan.c @@ -123,9 +123,10 @@ static void orinoco_add_hostscan_result(struct orinoco_private *priv, beacon_interval = le16_to_cpu(bss->a.beacon_interv); signal = SIGNAL_TO_MBM(le16_to_cpu(bss->a.level)); - cbss = cfg80211_inform_bss(wiphy, channel, bss->a.bssid, timestamp, - capability, beacon_interval, ie_buf, ie_len, - signal, GFP_KERNEL); + cbss = cfg80211_inform_bss(wiphy, channel, CFG80211_BSS_FTYPE_UNKNOWN, + bss->a.bssid, timestamp, capability, + beacon_interval, ie_buf, ie_len, signal, + GFP_KERNEL); cfg80211_put_bss(wiphy, cbss); } @@ -156,9 +157,10 @@ void orinoco_add_extscan_result(struct orinoco_private *priv, ie = bss->data; signal = SIGNAL_TO_MBM(bss->level); - cbss = cfg80211_inform_bss(wiphy, channel, bss->bssid, timestamp, - capability, beacon_interval, ie, ie_len, - signal, GFP_KERNEL); + cbss = cfg80211_inform_bss(wiphy, channel, CFG80211_BSS_FTYPE_UNKNOWN, + bss->bssid, timestamp, capability, + beacon_interval, ie, ie_len, signal, + GFP_KERNEL); cfg80211_put_bss(wiphy, cbss); } diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index d2a9a08210be..1a4facd1fbf3 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -2022,9 +2022,10 @@ static bool rndis_bss_info_update(struct usbnet *usbdev, capability = le16_to_cpu(fixed->capabilities); beacon_interval = le16_to_cpu(fixed->beacon_interval); - bss = cfg80211_inform_bss(priv->wdev.wiphy, channel, bssid->mac, - timestamp, capability, beacon_interval, ie, ie_len, signal, - GFP_KERNEL); + bss = cfg80211_inform_bss(priv->wdev.wiphy, channel, + CFG80211_BSS_FTYPE_UNKNOWN, bssid->mac, + timestamp, capability, beacon_interval, + ie, ie_len, signal, GFP_KERNEL); cfg80211_put_bss(priv->wdev.wiphy, bss); return (bss != NULL); @@ -2711,9 +2712,10 @@ static void rndis_wlan_craft_connected_bss(struct usbnet *usbdev, u8 *bssid, bssid, (u32)timestamp, capability, beacon_period, ie_len, ssid.essid, signal); - bss = cfg80211_inform_bss(priv->wdev.wiphy, channel, bssid, - timestamp, capability, beacon_period, ie_buf, ie_len, - signal, GFP_KERNEL); + bss = cfg80211_inform_bss(priv->wdev.wiphy, channel, + CFG80211_BSS_FTYPE_UNKNOWN, bssid, + timestamp, capability, beacon_period, + ie_buf, ie_len, signal, GFP_KERNEL); cfg80211_put_bss(priv->wdev.wiphy, bss); } diff --git a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c index 2d6d7d1a5b7d..8b0ccb5c5fc4 100644 --- a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c @@ -279,6 +279,7 @@ static int rtw_cfg80211_inform_bss(struct rtw_adapter *padapter, } bss = cfg80211_inform_bss(wiphy, notify_channel, + CFG80211_BSS_FTYPE_UNKNOWN, pnetwork->network.MacAddress, pnetwork->network.tsf, pnetwork->network.capability, diff --git a/drivers/staging/wlan-ng/cfg80211.c b/drivers/staging/wlan-ng/cfg80211.c index 3727f6d25cf1..8942dcb44180 100644 --- a/drivers/staging/wlan-ng/cfg80211.c +++ b/drivers/staging/wlan-ng/cfg80211.c @@ -422,6 +422,7 @@ static int prism2_scan(struct wiphy *wiphy, IEEE80211_BAND_2GHZ); bss = cfg80211_inform_bss(wiphy, ieee80211_get_channel(wiphy, freq), + CFG80211_BSS_FTYPE_UNKNOWN, (const u8 *) &(msg2.bssid.data.data), msg2.timestamp.data, msg2.capinfo.data, msg2.beaconperiod.data, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 77b85a89abca..ab21299c8f4d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3767,11 +3767,25 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, } /** - * cfg80211_inform_bss - inform cfg80211 of a new BSS + * enum cfg80211_bss_frame_type - frame type that the BSS data came from + * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is + * from a beacon or probe response + * @CFG80211_BSS_FTYPE_BEACON: data comes from a beacon + * @CFG80211_BSS_FTYPE_PRESP: data comes from a probe response + */ +enum cfg80211_bss_frame_type { + CFG80211_BSS_FTYPE_UNKNOWN, + CFG80211_BSS_FTYPE_BEACON, + CFG80211_BSS_FTYPE_PRESP, +}; + +/** + * cfg80211_inform_bss_width - inform cfg80211 of a new BSS * * @wiphy: the wiphy reporting the BSS * @rx_channel: The channel the frame was received on * @scan_width: width of the control channel + * @ftype: frame type (if known) * @bssid: the BSSID of the BSS * @tsf: the TSF sent by the peer in the beacon/probe response (or 0) * @capability: the capability field sent by the peer @@ -3791,6 +3805,7 @@ struct cfg80211_bss * __must_check cfg80211_inform_bss_width(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, enum nl80211_bss_scan_width scan_width, + enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, s32 signal, gfp_t gfp); @@ -3798,12 +3813,13 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, static inline struct cfg80211_bss * __must_check cfg80211_inform_bss(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, + enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, s32 signal, gfp_t gfp) { return cfg80211_inform_bss_width(wiphy, rx_channel, - NL80211_BSS_CHAN_WIDTH_20, + NL80211_BSS_CHAN_WIDTH_20, ftype, bssid, tsf, capability, beacon_interval, ie, ielen, signal, gfp); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ad1a1a2808d3..620a4b40d466 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -884,6 +884,7 @@ struct cfg80211_bss* cfg80211_inform_bss_width(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, enum nl80211_bss_scan_width scan_width, + enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, s32 signal, gfp_t gfp) @@ -911,7 +912,7 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, tmp.pub.beacon_interval = beacon_interval; tmp.pub.capability = capability; /* - * Since we do not know here whether the IEs are from a Beacon or Probe + * If we do not know here whether the IEs are from a Beacon or Probe * Response frame, we need to pick one of the options and only use it * with the driver that does not provide the full Beacon/Probe Response * frame. Use Beacon frame pointer to avoid indicating that this should @@ -926,7 +927,17 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, ies->from_beacon = false; memcpy(ies->data, ie, ielen); - rcu_assign_pointer(tmp.pub.beacon_ies, ies); + switch (ftype) { + case CFG80211_BSS_FTYPE_BEACON: + ies->from_beacon = true; + /* fall through to assign */ + case CFG80211_BSS_FTYPE_UNKNOWN: + rcu_assign_pointer(tmp.pub.beacon_ies, ies); + break; + case CFG80211_BSS_FTYPE_PRESP: + rcu_assign_pointer(tmp.pub.proberesp_ies, ies); + break; + } rcu_assign_pointer(tmp.pub.ies, ies); signal_valid = abs(rx_channel->center_freq - channel->center_freq) <= -- cgit v1.2.3 From f111f780ae1abf4cdc464f24293be90c010a04f6 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Wed, 20 Aug 2014 22:03:18 +0400 Subject: netfilter: nfnetlink_acct: add filter support to nfacct counter list/reset You can use this to skip accounting objects when listing/resetting via NFNL_MSG_ACCT_GET/NFNL_MSG_ACCT_GET_CTRZERO messages with the NLM_F_DUMP netlink flag. The filtering covers the following cases: 1. No filter specified. In this case, the client will get old behaviour, 2. List/reset counter object only: In this case, you have to use NFACCT_F_QUOTA as mask and value 0. 3. List/reset quota objects only: You have to use NFACCT_F_QUOTA_PKTS as mask and value - the same, for byte based quota mask should be NFACCT_F_QUOTA_BYTES and value - the same. If you want to obtain the object with any quota type (ie. NFACCT_F_QUOTA_PKTS|NFACCT_F_QUOTA_BYTES), you need to perform two dump requests, one to obtain NFACCT_F_QUOTA_PKTS objects and another for NFACCT_F_QUOTA_BYTES. Signed-off-by: Alexey Perevalov Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_acct.h | 8 ++++ net/netfilter/nfnetlink_acct.c | 54 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nfnetlink_acct.h b/include/uapi/linux/netfilter/nfnetlink_acct.h index 51404ec19022..f3e34dbbf966 100644 --- a/include/uapi/linux/netfilter/nfnetlink_acct.h +++ b/include/uapi/linux/netfilter/nfnetlink_acct.h @@ -28,9 +28,17 @@ enum nfnl_acct_type { NFACCT_USE, NFACCT_FLAGS, NFACCT_QUOTA, + NFACCT_FILTER, __NFACCT_MAX }; #define NFACCT_MAX (__NFACCT_MAX - 1) +enum nfnl_attr_filter_type { + NFACCT_FILTER_UNSPEC, + NFACCT_FILTER_MASK, + NFACCT_FILTER_VALUE, + __NFACCT_FILTER_MAX +}; +#define NFACCT_FILTER_MAX (__NFACCT_FILTER_MAX - 1) #endif /* _UAPI_NFNL_ACCT_H_ */ diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 3ea0eacbd970..c18af2f63eef 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -40,6 +40,11 @@ struct nf_acct { char data[0]; }; +struct nfacct_filter { + u32 value; + u32 mask; +}; + #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) #define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ @@ -181,6 +186,7 @@ static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nf_acct *cur, *last; + const struct nfacct_filter *filter = cb->data; if (cb->args[2]) return 0; @@ -197,6 +203,10 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) last = NULL; } + + if (filter && (cur->flags & filter->mask) != filter->value) + continue; + if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), @@ -211,6 +221,38 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int nfnl_acct_done(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + +static const struct nla_policy filter_policy[NFACCT_FILTER_MAX + 1] = { + [NFACCT_FILTER_MASK] = { .type = NLA_U32 }, + [NFACCT_FILTER_VALUE] = { .type = NLA_U32 }, +}; + +static struct nfacct_filter * +nfacct_filter_alloc(const struct nlattr * const attr) +{ + struct nfacct_filter *filter; + struct nlattr *tb[NFACCT_FILTER_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy); + if (err < 0) + return ERR_PTR(err); + + filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); + + filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK])); + filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE])); + + return filter; +} + static int nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) @@ -222,7 +264,18 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_acct_dump, + .done = nfnl_acct_done, }; + + if (tb[NFACCT_FILTER]) { + struct nfacct_filter *filter; + + filter = nfacct_filter_alloc(tb[NFACCT_FILTER]); + if (IS_ERR(filter)) + return PTR_ERR(filter); + + c.data = filter; + } return netlink_dump_start(nfnl, skb, nlh, &c); } @@ -314,6 +367,7 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { [NFACCT_PKTS] = { .type = NLA_U64 }, [NFACCT_FLAGS] = { .type = NLA_U32 }, [NFACCT_QUOTA] = { .type = NLA_U64 }, + [NFACCT_FILTER] = {.type = NLA_NESTED }, }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { -- cgit v1.2.3 From 5c21403d74af2c9cd635a34c2f9199681a5b813e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 27 Aug 2014 14:39:04 -0700 Subject: net: Update sk_buff flag bit availability comment. We lost one when xmit_more was added. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9b3802a197a8..b69b7b512c06 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -580,7 +580,7 @@ struct sk_buff { __u8 encap_hdr_csum:1; __u8 csum_valid:1; __u8 csum_complete_sw:1; - /* 2/4 bit hole (depending on ndisc_nodetype presence) */ + /* 1/3 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL -- cgit v1.2.3 From 3e8a72d1dae374cf6fc1dba97cec663585845ff9 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:46 -0700 Subject: net: dsa: reduce number of protocol hooks DSA is currently registering one packet_type function per EtherType it needs to intercept in the receive path of a DSA-enabled Ethernet device. Right now we have three of them: trailer, DSA and eDSA, and there might be more in the future, this will not scale to the addition of new protocols. This patch proceeds with adding a new layer of abstraction and two new functions: dsa_switch_rcv() which will dispatch into the tag-protocol specific receive function implemented by net/dsa/tag_*.c dsa_slave_xmit() which will dispatch into the tag-protocol specific transmit function implemented by net/dsa/tag_*.c When we do create the per-port slave network devices, we iterate over the switch protocol to assign the DSA-specific receive and transmit operations. A new fake ethertype value is used: ETH_P_XDSA to illustrate the fact that this is no longer going to look like ETH_P_DSA or ETH_P_TRAILER like it used to be. This allows us to greatly simplify the check in eth_type_trans() and always override the skb->protocol with ETH_P_XDSA for Ethernet switches tagged protocol, while also reducing the number repetitive slave netdevice_ops assignments. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 28 ++++++++++++--------------- include/net/dsa.h | 20 +++---------------- include/uapi/linux/if_ether.h | 1 + net/dsa/dsa.c | 39 ++++++++++++++++++++----------------- net/dsa/dsa_priv.h | 9 +++------ net/dsa/slave.c | 45 ++++++++++++++----------------------------- net/dsa/tag_dsa.c | 8 ++++---- net/dsa/tag_edsa.c | 8 ++++---- net/dsa/tag_trailer.c | 8 ++++---- net/ethernet/eth.c | 7 ++----- 10 files changed, 68 insertions(+), 105 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 039b23786c22..1875dc71422a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1781,24 +1781,13 @@ void dev_net_set(struct net_device *dev, struct net *net) #endif } -static inline bool netdev_uses_dsa_tags(struct net_device *dev) +static inline bool netdev_uses_dsa(struct net_device *dev) { -#ifdef CONFIG_NET_DSA_TAG_DSA - if (dev->dsa_ptr != NULL) - return dsa_uses_dsa_tags(dev->dsa_ptr); -#endif - - return 0; -} - -static inline bool netdev_uses_trailer_tags(struct net_device *dev) -{ -#ifdef CONFIG_NET_DSA_TAG_TRAILER - if (dev->dsa_ptr != NULL) - return dsa_uses_trailer_tags(dev->dsa_ptr); +#ifdef CONFIG_NET_DSA + return dev->dsa_ptr != NULL; +#else + return false; #endif - - return 0; } /** @@ -1933,6 +1922,13 @@ struct udp_offload { struct offload_callbacks callbacks; }; +struct dsa_device_ops { + netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); + int (*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); +}; + + /* often modified stats are per cpu, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64 rx_packets; diff --git a/include/net/dsa.h b/include/net/dsa.h index 6efce384451e..6e26f1e4d8ce 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -59,6 +59,8 @@ struct dsa_platform_data { struct dsa_chip_data *chip; }; +struct dsa_device_ops; + struct dsa_switch_tree { /* * Configuration data for the platform device that owns @@ -71,6 +73,7 @@ struct dsa_switch_tree { * protocol to use. */ struct net_device *master_netdev; + const struct dsa_device_ops *ops; __be16 tag_protocol; /* @@ -186,21 +189,4 @@ static inline void *ds_to_priv(struct dsa_switch *ds) return (void *)(ds + 1); } -/* - * The original DSA tag format and some other tag formats have no - * ethertype, which means that we need to add a little hack to the - * networking receive path to make sure that received frames get - * the right ->protocol assigned to them when one of those tag - * formats is in use. - */ -static inline bool dsa_uses_dsa_tags(struct dsa_switch_tree *dst) -{ - return !!(dst->tag_protocol == htons(ETH_P_DSA)); -} - -static inline bool dsa_uses_trailer_tags(struct dsa_switch_tree *dst) -{ - return !!(dst->tag_protocol == htons(ETH_P_TRAILER)); -} - #endif diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 0f8210b8e0bc..aa63ed023c2b 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -128,6 +128,7 @@ #define ETH_P_PHONET 0x00F5 /* Nokia Phonet frames */ #define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */ #define ETH_P_CAIF 0x00F7 /* ST-Ericsson CAIF protocol */ +#define ETH_P_XDSA 0x00F8 /* Multiplexed DSA protocol */ /* * This is an Ethernet frame header. diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 0a49632fac47..92e71d2a2ccd 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -608,6 +608,24 @@ static void dsa_shutdown(struct platform_device *pdev) { } +static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + + if (unlikely(dst == NULL)) { + kfree_skb(skb); + return 0; + } + + return dst->ops->rcv(skb, dev, pt, orig_dev); +} + +struct packet_type dsa_pack_type __read_mostly = { + .type = cpu_to_be16(ETH_P_XDSA), + .func = dsa_switch_rcv, +}; + static const struct of_device_id dsa_of_match_table[] = { { .compatible = "marvell,dsa", }, {} @@ -633,30 +651,15 @@ static int __init dsa_init_module(void) if (rc) return rc; -#ifdef CONFIG_NET_DSA_TAG_DSA - dev_add_pack(&dsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - dev_add_pack(&edsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - dev_add_pack(&trailer_packet_type); -#endif + dev_add_pack(&dsa_pack_type); + return 0; } module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { -#ifdef CONFIG_NET_DSA_TAG_TRAILER - dev_remove_pack(&trailer_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - dev_remove_pack(&edsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_DSA - dev_remove_pack(&dsa_packet_type); -#endif + dev_remove_pack(&dsa_pack_type); platform_driver_unregister(&dsa_driver); } module_exit(dsa_cleanup_module); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d4cf5cc747e3..218d75d16f6f 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -45,16 +45,13 @@ struct net_device *dsa_slave_create(struct dsa_switch *ds, int port, char *name); /* tag_dsa.c */ -netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type dsa_packet_type; +extern const struct dsa_device_ops dsa_netdev_ops; /* tag_edsa.c */ -netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type edsa_packet_type; +extern const struct dsa_device_ops edsa_netdev_ops; /* tag_trailer.c */ -netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type trailer_packet_type; +extern const struct dsa_device_ops trailer_netdev_ops; #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 45a1e34c89e0..ad1a913533aa 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -171,6 +171,14 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } +static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch_tree *dst = p->parent->dst; + + return dst->ops->xmit(skb, dev); +} + /* ethtool operations *******************************************************/ static int @@ -293,42 +301,16 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_sset_count = dsa_slave_get_sset_count, }; -#ifdef CONFIG_NET_DSA_TAG_DSA -static const struct net_device_ops dsa_netdev_ops = { +static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, - .ndo_start_xmit = dsa_xmit, + .ndo_start_xmit = dsa_slave_xmit, .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, .ndo_do_ioctl = dsa_slave_ioctl, }; -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA -static const struct net_device_ops edsa_netdev_ops = { - .ndo_init = dsa_slave_init, - .ndo_open = dsa_slave_open, - .ndo_stop = dsa_slave_close, - .ndo_start_xmit = edsa_xmit, - .ndo_change_rx_flags = dsa_slave_change_rx_flags, - .ndo_set_rx_mode = dsa_slave_set_rx_mode, - .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_do_ioctl = dsa_slave_ioctl, -}; -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER -static const struct net_device_ops trailer_netdev_ops = { - .ndo_init = dsa_slave_init, - .ndo_open = dsa_slave_open, - .ndo_stop = dsa_slave_close, - .ndo_start_xmit = trailer_xmit, - .ndo_change_rx_flags = dsa_slave_change_rx_flags, - .ndo_set_rx_mode = dsa_slave_set_rx_mode, - .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_do_ioctl = dsa_slave_ioctl, -}; -#endif /* slave device setup *******************************************************/ struct net_device * @@ -349,21 +331,22 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; eth_hw_addr_inherit(slave_dev, master); slave_dev->tx_queue_len = 0; + slave_dev->netdev_ops = &dsa_slave_netdev_ops; switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA case htons(ETH_P_DSA): - slave_dev->netdev_ops = &dsa_netdev_ops; + ds->dst->ops = &dsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_EDSA case htons(ETH_P_EDSA): - slave_dev->netdev_ops = &edsa_netdev_ops; + ds->dst->ops = &edsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER case htons(ETH_P_TRAILER): - slave_dev->netdev_ops = &trailer_netdev_ops; + ds->dst->ops = &trailer_netdev_ops; break; #endif default: diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index cacce1e22f9c..d7dbc5bda5c0 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -16,7 +16,7 @@ #define DSA_HLEN 4 -netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *dsa_header; @@ -186,7 +186,7 @@ out: return 0; } -struct packet_type dsa_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_DSA), - .func = dsa_rcv, +const struct dsa_device_ops dsa_netdev_ops = { + .xmit = dsa_xmit, + .rcv = dsa_rcv, }; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index e70c43c25e64..6b30abe89183 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -17,7 +17,7 @@ #define DSA_HLEN 4 #define EDSA_HLEN 8 -netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *edsa_header; @@ -205,7 +205,7 @@ out: return 0; } -struct packet_type edsa_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_EDSA), - .func = edsa_rcv, +const struct dsa_device_ops edsa_netdev_ops = { + .xmit = edsa_xmit, + .rcv = edsa_rcv, }; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 94bc260d015d..5fe9444842c5 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -14,7 +14,7 @@ #include #include "dsa_priv.h" -netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct sk_buff *nskb; @@ -114,7 +114,7 @@ out: return 0; } -struct packet_type trailer_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_TRAILER), - .func = trailer_rcv, +const struct dsa_device_ops trailer_netdev_ops = { + .xmit = trailer_xmit, + .rcv = trailer_rcv, }; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index f405e0592407..5cebca134585 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -181,11 +181,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. */ - if (unlikely(netdev_uses_dsa_tags(dev))) - return htons(ETH_P_DSA); - - if (unlikely(netdev_uses_trailer_tags(dev))) - return htons(ETH_P_TRAILER); + if (unlikely(netdev_uses_dsa(dev))) + return htons(ETH_P_XDSA); if (likely(ntohs(eth->h_proto) >= ETH_P_802_3_MIN)) return eth->h_proto; -- cgit v1.2.3 From 464c3668f065baeacfffa9d421959d21069389fe Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:48 -0700 Subject: net: phy: provide stub for fixed_phy_set_link_update In preparation for updating the DSA code and avoid using ifdefs there, provide an empty stub for fixed_phy_set_link_update when CONFIG_FIXED_PHY is not selected. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy_fixed.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index ae612acebb53..941138664c1d 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -18,6 +18,9 @@ extern int fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, struct device_node *np); extern void fixed_phy_del(int phy_addr); +extern int fixed_phy_set_link_update(struct phy_device *phydev, + int (*link_update)(struct net_device *, + struct fixed_phy_status *)); #else static inline int fixed_phy_add(unsigned int irq, int phy_id, struct fixed_phy_status *status) @@ -34,14 +37,12 @@ static inline int fixed_phy_del(int phy_addr) { return -ENODEV; } -#endif /* CONFIG_FIXED_PHY */ - -/* - * This function issued only by fixed_phy-aware drivers, no need - * protect it with #ifdef - */ -extern int fixed_phy_set_link_update(struct phy_device *phydev, +static inline int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, - struct fixed_phy_status *)); + struct fixed_phy_status *)) +{ + return -ENODEV; +} +#endif /* CONFIG_FIXED_PHY */ #endif /* __PHY_FIXED_H */ -- cgit v1.2.3 From fa981d9af82e08f316ed25ed43078f995cc4be0a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:49 -0700 Subject: net: dsa: provide a switch device device tree node pointer We might need to fetch additional resources from the device tree node pointer, such as register ranges or other properties. Keep a device_node pointer around for this. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ net/dsa/dsa.c | 1 + 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6e26f1e4d8ce..decc62709acd 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -15,6 +15,7 @@ #include #include #include +#include #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 @@ -26,6 +27,12 @@ struct dsa_chip_data { struct device *mii_bus; int sw_addr; + /* Device tree node pointer for this specific switch chip + * used during switch setup in case additional properties + * and resources needs to be used + */ + struct device_node *of_node; + /* * The names of the switch's ports. Use "cpu" to * designate the switch port that the cpu is connected to, diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 92e71d2a2ccd..a28ef432d016 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -410,6 +410,7 @@ static int dsa_of_probe(struct platform_device *pdev) chip_index++; cd = &pd->chip[chip_index]; + cd->of_node = child; cd->mii_bus = &mdio_bus->dev; sw_addr = of_get_property(child, "reg", NULL); -- cgit v1.2.3 From bd47497a0171b96264927e3377254db13b9fe3e3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:50 -0700 Subject: net: dsa: retain a per-port device_node pointer We will later use the per-port device_node pointer to fetch a bunch of port-specific properties. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/dsa.c | 2 ++ net/dsa/slave.c | 1 + 3 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index decc62709acd..597875d3f69e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -41,6 +41,7 @@ struct dsa_chip_data { * or any other string to indicate this is a physical port. */ char *port_names[DSA_MAX_PORTS]; + struct device_node *port_dn[DSA_MAX_PORTS]; /* * An array (with nr_chips elements) of which element [a] diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index a28ef432d016..6a5bae673037 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -432,6 +432,8 @@ static int dsa_of_probe(struct platform_device *pdev) if (!port_name) continue; + cd->port_dn[port_index] = port; + cd->port_names[port_index] = kstrdup(port_name, GFP_KERNEL); if (!cd->port_names[port_index]) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ad1a913533aa..5688c34253e5 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -354,6 +354,7 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, } SET_NETDEV_DEV(slave_dev, parent); + slave_dev->dev.of_node = ds->pd->port_dn[port]; slave_dev->vlan_features = master->vlan_features; p = netdev_priv(slave_dev); -- cgit v1.2.3 From 0d8bcdd383b8865e752a7e8edb4712c2e3902052 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:51 -0700 Subject: net: dsa: allow for more complex PHY setups Modify the DSA slave interface to be bound to an arbitray PHY, not just the ones that are available as child PHY devices of the switch MDIO bus. This allows us for instance to have external PHYs connected to a separate MDIO bus, but yet also connected to a given switch port. Under certain configurations, the physical port mask might not be a 1:1 mapping to the MII PHYs mask. This is the case, if e.g: Port 1 of the switch is used and connects to a PHY at a MDIO address different than 1. Introduce a phys_mii_mask variable which allows driver to implement and divert their own MDIO read/writes operations for a subset of the MDIO PHY addresses. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/dsa.c | 5 ++++ net/dsa/dsa_priv.h | 4 +++ net/dsa/slave.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 83 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 597875d3f69e..dc357454ae3b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -130,6 +130,7 @@ struct dsa_switch { */ u32 dsa_port_mask; u32 phys_port_mask; + u32 phys_mii_mask; struct mii_bus *slave_mii_bus; struct net_device *ports[DSA_MAX_PORTS]; }; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6a5bae673037..4dc2a16b72cf 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -144,6 +144,11 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, goto out; } + /* Make the built-in MII bus mask match the number of ports, + * switch drivers can override this later + */ + ds->phys_mii_mask = ds->phys_port_mask; + /* * If the CPU connects to this switch, set the switch tree * tagging protocol to the preferred tagging format of this diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 218d75d16f6f..d20364ac1574 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -33,6 +33,10 @@ struct dsa_slave_priv { * to this port. */ struct phy_device *phy; + phy_interface_t phy_interface; + int old_link; + int old_pause; + int old_duplex; }; /* dsa.c */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5688c34253e5..03d2894a0f8a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include "dsa_priv.h" /* slave mii_bus handling ***************************************************/ @@ -19,7 +21,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { struct dsa_switch *ds = bus->priv; - if (ds->phys_port_mask & (1 << addr)) + if (ds->phys_mii_mask & (1 << addr)) return ds->drv->phy_read(ds, addr, reg); return 0xffff; @@ -29,7 +31,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) { struct dsa_switch *ds = bus->priv; - if (ds->phys_port_mask & (1 << addr)) + if (ds->phys_mii_mask & (1 << addr)) return ds->drv->phy_write(ds, addr, reg, val); return 0; @@ -312,7 +314,70 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_do_ioctl = dsa_slave_ioctl, }; +static void dsa_slave_adjust_link(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + unsigned int status_changed = 0; + + if (p->old_link != p->phy->link) { + status_changed = 1; + p->old_link = p->phy->link; + } + + if (p->old_duplex != p->phy->duplex) { + status_changed = 1; + p->old_duplex = p->phy->duplex; + } + + if (p->old_pause != p->phy->pause) { + status_changed = 1; + p->old_pause = p->phy->pause; + } + + if (status_changed) + phy_print_status(p->phy); +} + /* slave device setup *******************************************************/ +static void dsa_slave_phy_setup(struct dsa_slave_priv *p, + struct net_device *slave_dev) +{ + struct dsa_switch *ds = p->parent; + struct dsa_chip_data *cd = ds->pd; + struct device_node *phy_dn, *port_dn; + int ret; + + port_dn = cd->port_dn[p->port]; + p->phy_interface = of_get_phy_mode(port_dn); + + phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); + if (of_phy_is_fixed_link(port_dn)) { + /* In the case of a fixed PHY, the DT node associated + * to the fixed PHY is the Port DT node + */ + ret = of_phy_register_fixed_link(port_dn); + if (ret) { + pr_err("failed to register fixed PHY\n"); + return; + } + phy_dn = port_dn; + } + + if (phy_dn) + p->phy = of_phy_connect(slave_dev, phy_dn, + dsa_slave_adjust_link, 0, + p->phy_interface); + + /* We could not connect to a designated PHY, so use the switch internal + * MDIO bus instead + */ + if (!p->phy) + p->phy = ds->slave_mii_bus->phy_map[p->port]; + else + pr_info("attached PHY at address %d [%s]\n", + p->phy->addr, p->phy->drv->name); +} + struct net_device * dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name) @@ -361,7 +426,12 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, p->dev = slave_dev; p->parent = ds; p->port = port; - p->phy = ds->slave_mii_bus->phy_map[port]; + + p->old_pause = -1; + p->old_link = -1; + p->old_duplex = -1; + + dsa_slave_phy_setup(p, slave_dev); ret = register_netdev(slave_dev); if (ret) { -- cgit v1.2.3 From 5aed85cec29882d1c4b4b2a01cb75a99efdbe4ed Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:52 -0700 Subject: net: dsa: allow switches to work without tagging In case switch port tagging is disabled (voluntarily, or the switch just does not support it), allow us to continue using the defined set of dsa_device_ops in net/dsa/slave.c. We introduce dsa_protocol_is_tagged() to check whether we need to override skb->protocol and go through the DSA-specifif packet_type function, or if we just go on and receive the SKB through the normal path. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++--- include/net/dsa.h | 5 +++++ net/dsa/slave.c | 19 ++++++++++++++++++- 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1875dc71422a..429801370d0c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1784,10 +1784,10 @@ void dev_net_set(struct net_device *dev, struct net *net) static inline bool netdev_uses_dsa(struct net_device *dev) { #ifdef CONFIG_NET_DSA - return dev->dsa_ptr != NULL; -#else - return false; + if (dev->dsa_ptr != NULL) + return dsa_uses_tagged_protocol(dev->dsa_ptr); #endif + return false; } /** diff --git a/include/net/dsa.h b/include/net/dsa.h index dc357454ae3b..1035f6452d79 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -198,4 +198,9 @@ static inline void *ds_to_priv(struct dsa_switch *ds) return (void *)(ds + 1); } +static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) +{ + return dst->tag_protocol != 0; +} + #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 03d2894a0f8a..241c2a1684cb 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -181,6 +181,17 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) return dst->ops->xmit(skb, dev); } +static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + skb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(skb); + + return NETDEV_TX_OK; +} + /* ethtool operations *******************************************************/ static int @@ -314,6 +325,11 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_do_ioctl = dsa_slave_ioctl, }; +static const struct dsa_device_ops notag_netdev_ops = { + .xmit = dsa_slave_notag_xmit, + .rcv = NULL, +}; + static void dsa_slave_adjust_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -415,7 +431,8 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, break; #endif default: - BUG(); + ds->dst->ops = ¬ag_netdev_ops; + break; } SET_NETDEV_DEV(slave_dev, parent); -- cgit v1.2.3 From ec9436baedb689668c409cfc8b69eb9573b0d661 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:53 -0700 Subject: net: dsa: allow drivers to do link adjustment Whenever libphy determines that the link status of a given PHY/port has changed, allow to call into the switch driver link adjustment callback so proper actions can be taken care of by the switch driver upon link notification. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ net/dsa/slave.c | 4 ++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1035f6452d79..2d3835924dd2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -16,6 +16,7 @@ #include #include #include +#include #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 @@ -181,6 +182,12 @@ struct dsa_switch_driver { */ void (*poll_link)(struct dsa_switch *ds); + /* + * Link state adjustment (called from libphy) + */ + void (*adjust_link)(struct dsa_switch *ds, int port, + struct phy_device *phydev); + /* * ethtool hardware statistics. */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 241c2a1684cb..398d0663d3dd 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -333,6 +333,7 @@ static const struct dsa_device_ops notag_netdev_ops = { static void dsa_slave_adjust_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; unsigned int status_changed = 0; if (p->old_link != p->phy->link) { @@ -350,6 +351,9 @@ static void dsa_slave_adjust_link(struct net_device *dev) p->old_pause = p->phy->pause; } + if (ds->drv->adjust_link && status_changed) + ds->drv->adjust_link(ds, p->port, p->phy); + if (status_changed) phy_print_status(p->phy); } -- cgit v1.2.3 From ce31b31c68e7e39f29b1257581fbd08ce3ca5589 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:54 -0700 Subject: net: dsa: allow updating fixed PHY link information Allow switch drivers to hook a PHY link update callback to perform port-specific link work. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ net/dsa/slave.c | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2d3835924dd2..2c9563f0b2f4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -17,6 +17,7 @@ #include #include #include +#include #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 @@ -187,6 +188,8 @@ struct dsa_switch_driver { */ void (*adjust_link)(struct dsa_switch *ds, int port, struct phy_device *phydev); + void (*fixed_link_update)(struct dsa_switch *ds, int port, + struct fixed_phy_status *st); /* * ethtool hardware statistics. diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 398d0663d3dd..18ff53836fe3 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -358,6 +358,18 @@ static void dsa_slave_adjust_link(struct net_device *dev) phy_print_status(p->phy); } +static int dsa_slave_fixed_link_update(struct net_device *dev, + struct fixed_phy_status *status) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->drv->fixed_link_update) + ds->drv->fixed_link_update(ds, p->port, status); + + return 0; +} + /* slave device setup *******************************************************/ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, struct net_device *slave_dev) @@ -365,6 +377,7 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, struct dsa_switch *ds = p->parent; struct dsa_chip_data *cd = ds->pd; struct device_node *phy_dn, *port_dn; + bool phy_is_fixed = false; int ret; port_dn = cd->port_dn[p->port]; @@ -380,6 +393,7 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, pr_err("failed to register fixed PHY\n"); return; } + phy_is_fixed = true; phy_dn = port_dn; } @@ -388,6 +402,9 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, dsa_slave_adjust_link, 0, p->phy_interface); + if (p->phy && phy_is_fixed) + fixed_phy_set_link_update(p->phy, dsa_slave_fixed_link_update); + /* We could not connect to a designated PHY, so use the switch internal * MDIO bus instead */ -- cgit v1.2.3 From 5037d532b83d7325a2743dffe82882a64697a8e8 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:55 -0700 Subject: net: dsa: add Broadcom tag RX/TX handler Add support for the 4-bytes Broadcom tag that built-in switches such as the Starfighter 2 might insert when receiving packets, or that we need to insert while targetting specific switch ports. We use a fake local EtherType value for this 4-bytes switch tag: ETH_P_BRCMTAG to make sure we can assign DSA-specific network operations within the DSA drivers. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 5 ++ net/dsa/Kconfig | 3 + net/dsa/Makefile | 1 + net/dsa/dsa_priv.h | 3 + net/dsa/slave.c | 5 ++ net/dsa/tag_brcm.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 190 insertions(+) create mode 100644 net/dsa/tag_brcm.c (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2c9563f0b2f4..97712927a9d2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -19,6 +19,11 @@ #include #include +/* Not an official ethertype value, used only internally for DSA + * demultiplexing + */ +#define ETH_P_BRCMTAG (ETH_P_XDSA + 1) + #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index f5eede1d6cb8..a585fd6352eb 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -12,6 +12,9 @@ config NET_DSA if NET_DSA # tagging formats +config NET_DSA_TAG_BRCM + bool + config NET_DSA_TAG_DSA bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 7b9fcbbeda5d..da06ed1df620 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_NET_DSA) += dsa_core.o dsa_core-y += dsa.o slave.o # tagging formats +dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d20364ac1574..98afed4d92ba 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -57,5 +57,8 @@ extern const struct dsa_device_ops edsa_netdev_ops; /* tag_trailer.c */ extern const struct dsa_device_ops trailer_netdev_ops; +/* tag_brcm.c */ +extern const struct dsa_device_ops brcm_netdev_ops; + #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 18ff53836fe3..7333a4aebb7d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -450,6 +450,11 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, case htons(ETH_P_TRAILER): ds->dst->ops = &trailer_netdev_ops; break; +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM + case htons(ETH_P_BRCMTAG): + ds->dst->ops = &brcm_netdev_ops; + break; #endif default: ds->dst->ops = ¬ag_netdev_ops; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c new file mode 100644 index 000000000000..e0b759ec209e --- /dev/null +++ b/net/dsa/tag_brcm.c @@ -0,0 +1,173 @@ +/* + * Broadcom tag support + * + * Copyright (C) 2014 Broadcom Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include "dsa_priv.h" + +/* This tag length is 4 bytes, older ones were 6 bytes, we do not + * handle them + */ +#define BRCM_TAG_LEN 4 + +/* Tag is constructed and desconstructed using byte by byte access + * because the tag is placed after the MAC Source Address, which does + * not make it 4-bytes aligned, so this might cause unaligned accesses + * on most systems where this is used. + */ + +/* Ingress and egress opcodes */ +#define BRCM_OPCODE_SHIFT 5 +#define BRCM_OPCODE_MASK 0x7 + +/* Ingress fields */ +/* 1st byte in the tag */ +#define BRCM_IG_TC_SHIFT 2 +#define BRCM_IG_TC_MASK 0x7 +/* 2nd byte in the tag */ +#define BRCM_IG_TE_MASK 0x3 +#define BRCM_IG_TS_SHIFT 7 +/* 3rd byte in the tag */ +#define BRCM_IG_DSTMAP2_MASK 1 +#define BRCM_IG_DSTMAP1_MASK 0xff + +/* Egress fields */ + +/* 2nd byte in the tag */ +#define BRCM_EG_CID_MASK 0xff + +/* 3rd byte in the tag */ +#define BRCM_EG_RC_MASK 0xff +#define BRCM_EG_RC_RSVD (3 << 6) +#define BRCM_EG_RC_EXCEPTION (1 << 5) +#define BRCM_EG_RC_PROT_SNOOP (1 << 4) +#define BRCM_EG_RC_PROT_TERM (1 << 3) +#define BRCM_EG_RC_SWITCH (1 << 2) +#define BRCM_EG_RC_MAC_LEARN (1 << 1) +#define BRCM_EG_RC_MIRROR (1 << 0) +#define BRCM_EG_TC_SHIFT 5 +#define BRCM_EG_TC_MASK 0x7 +#define BRCM_EG_PID_MASK 0x1f + +static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + u8 *brcm_tag; + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + if (skb_cow_head(skb, BRCM_TAG_LEN) < 0) + goto out_free; + + skb_push(skb, BRCM_TAG_LEN); + + memmove(skb->data, skb->data + BRCM_TAG_LEN, 2 * ETH_ALEN); + + /* Build the tag after the MAC Source Address */ + brcm_tag = skb->data + 2 * ETH_ALEN; + + /* Set the ingress opcode, traffic class, tag enforcment is + * deprecated + */ + brcm_tag[0] = (1 << BRCM_OPCODE_SHIFT) | + ((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK); + brcm_tag[1] = 0; + brcm_tag[2] = 0; + if (p->port == 8) + brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; + brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK; + + /* Queue the SKB for transmission on the parent interface, but + * do not modify its EtherType + */ + skb->protocol = htons(ETH_P_BRCMTAG); + skb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(skb); + + return NETDEV_TX_OK; + +out_free: + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + int source_port; + u8 *brcm_tag; + + if (unlikely(dst == NULL)) + goto out_drop; + + ds = dst->ds[0]; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (skb == NULL) + goto out; + + if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN))) + goto out_drop; + + /* skb->data points to the EtherType, the tag is right before it */ + brcm_tag = skb->data - 2; + + /* The opcode should never be different than 0b000 */ + if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK)) + goto out_drop; + + /* We should never see a reserved reason code without knowing how to + * handle it + */ + WARN_ON(brcm_tag[2] & BRCM_EG_RC_RSVD); + + /* Locate which port this is coming from */ + source_port = brcm_tag[3] & BRCM_EG_PID_MASK; + + /* Validate port against switch setup, either the port is totally */ + if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + goto out_drop; + + /* Remove Broadcom tag and update checksum */ + skb_pull_rcsum(skb, BRCM_TAG_LEN); + + /* Move the Ethernet DA and SA */ + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - BRCM_TAG_LEN, + 2 * ETH_ALEN); + + skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->dev = ds->ports[source_port]; + skb->protocol = eth_type_trans(skb, skb->dev); + + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; + + netif_receive_skb(skb); + + return 0; + +out_drop: + kfree_skb(skb); +out: + return 0; +} + +const struct dsa_device_ops brcm_netdev_ops = { + .xmit = brcm_tag_xmit, + .rcv = brcm_tag_rcv, +}; -- cgit v1.2.3 From 97fdaab4699de3a2a91001efef60bb0622de1c53 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 26 Aug 2014 13:15:25 -0700 Subject: net: phy: broadcom: fix PHY_BCM_OUI_4 PHY_BCM_OUI_4 is missing two significant digits that actually make it an OUI, add those missing bits so it becomes usable again for matching. Fixes: b560a58c45c6 ("net: phy: add Broadcom BCM7xxx internal PHY driver") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index ee1431d976fa..921f17ca4c26 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -21,7 +21,7 @@ #define PHY_BCM_OUI_1 0x00206000 #define PHY_BCM_OUI_2 0x0143bc00 #define PHY_BCM_OUI_3 0x03625c00 -#define PHY_BCM_OUI_4 0x600d0000 +#define PHY_BCM_OUI_4 0x600d8400 #define PHY_BCM_OUI_5 0x03625e00 -- cgit v1.2.3 From 11bf2bbd596add62a86a74fc7aedc0b86c6ec154 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 26 Aug 2014 13:15:26 -0700 Subject: net: phy: broadcom: add new Broadcom OUI Broadcom started to use a new OUI for its 2013 and newer products: D4-01-29 which translates into 0xae025000 for a 32-bits OUI, add its definition. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 921f17ca4c26..cbcfad36d4c0 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -23,7 +23,7 @@ #define PHY_BCM_OUI_3 0x03625c00 #define PHY_BCM_OUI_4 0x600d8400 #define PHY_BCM_OUI_5 0x03625e00 - +#define PHY_BCM_OUI_6 0xae025000 #define PHY_BCM_FLAGS_MODE_COPPER 0x00000001 #define PHY_BCM_FLAGS_MODE_1000BX 0x00000002 -- cgit v1.2.3 From 430ad68ffb5fa632a277162e5995cd6f7a39fb78 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 26 Aug 2014 13:15:27 -0700 Subject: net: phy: bcm7xxx: add BCM7250 and BCM7364 PHY entries Add two new entries to the Broadcom BCM7xxx internal PHY driver for BCM7250 and BCM7364 chips. Those chips share the usual 28nm process Gigabit PHY sequence and require the same workarounds so far. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm7xxx.c | 4 ++++ include/linux/brcmphy.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index 948c7086679a..09dd6e1dc6e1 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -335,6 +335,8 @@ static int bcm7xxx_dummy_config_init(struct phy_device *phydev) } static struct phy_driver bcm7xxx_driver[] = { + BCM7XXX_28NM_GPHY(PHY_ID_BCM7250, "Broadcom BCM7250"), + BCM7XXX_28NM_GPHY(PHY_ID_BCM7364, "Broadcom BCM7364"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7366, "Broadcom BCM7366"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7439, "Broadcom BCM7439"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7445, "Broadcom BCM7445"), @@ -367,6 +369,8 @@ static struct phy_driver bcm7xxx_driver[] = { } }; static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = { + { PHY_ID_BCM7250, 0xfffffff0, }, + { PHY_ID_BCM7364, 0xfffffff0, }, { PHY_ID_BCM7366, 0xfffffff0, }, { PHY_ID_BCM7439, 0xfffffff0, }, { PHY_ID_BCM7445, 0xfffffff0, }, diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index cbcfad36d4c0..5bd35cc0d471 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -13,6 +13,8 @@ #define PHY_ID_BCM5461 0x002060c0 #define PHY_ID_BCM57780 0x03625d90 +#define PHY_ID_BCM7250 0xae025280 +#define PHY_ID_BCM7364 0xae025260 #define PHY_ID_BCM7366 0x600d8490 #define PHY_ID_BCM7439 0x600d8480 #define PHY_ID_BCM7445 0x600d8510 -- cgit v1.2.3 From 10c51b56232d24f150e39884a9e749fd99cbc60c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 27 Aug 2014 11:11:27 +0200 Subject: net: add skb_get_tx_queue() helper Replace occurences of skb_get_queue_mapping() and follow-up netdev_get_tx_queue() with an actual helper function. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ net/core/netpoll.c | 2 +- net/core/pktgen.c | 4 +--- net/packet/af_packet.c | 4 +--- net/sched/sch_generic.c | 6 ++++-- 5 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 429801370d0c..dfc1d8b8bd0f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1747,6 +1747,12 @@ struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, return &dev->_tx[index]; } +static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev, + const struct sk_buff *skb) +{ + return netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); +} + static inline void netdev_for_each_tx_queue(struct net_device *dev, void (*f)(struct net_device *, struct netdev_queue *, diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a5ad06828d67..12b1df976562 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -115,7 +115,7 @@ static void queue_process(struct work_struct *work) continue; } - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(dev, skb); local_irq_save(flags); HARD_TX_LOCK(dev, txq, smp_processor_id()); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 83e2b4b19eb7..d81b540096c3 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3286,7 +3286,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) { struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; - u16 queue_map; int ret; /* If device is offline, then don't send */ @@ -3324,8 +3323,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) if (pkt_dev->delay && pkt_dev->last_ok) spin(pkt_dev, pkt_dev->next_tx); - queue_map = skb_get_queue_mapping(pkt_dev->skb); - txq = netdev_get_tx_queue(odev, queue_map); + txq = skb_get_tx_queue(odev, pkt_dev->skb); local_bh_disable(); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0dfa990d4eaa..b7a7f5a721bd 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -243,7 +243,6 @@ static int packet_direct_xmit(struct sk_buff *skb) netdev_features_t features; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; - u16 queue_map; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) @@ -254,8 +253,7 @@ static int packet_direct_xmit(struct sk_buff *skb) __skb_linearize(skb)) goto drop; - queue_map = skb_get_queue_mapping(skb); - txq = netdev_get_tx_queue(dev, queue_map); + txq = skb_get_tx_queue(dev, skb); local_bh_disable(); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index fc04fe93c2da..05b3f5d104af 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -63,7 +63,7 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) if (unlikely(skb)) { /* check the reason of requeuing without tx lock first */ - txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { q->gso_skb = NULL; q->q.qlen--; @@ -183,10 +183,12 @@ static inline int qdisc_restart(struct Qdisc *q) skb = dequeue_skb(q); if (unlikely(!skb)) return 0; + WARN_ON_ONCE(skb_dst_is_noref(skb)); + root_lock = qdisc_lock(q); dev = qdisc_dev(q); - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(dev, skb); return sch_direct_xmit(skb, q, dev, txq, root_lock); } -- cgit v1.2.3 From 18fe8db5f2b53e4ac67b47048f24f50c57a2a759 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 28 Aug 2014 13:44:31 +0200 Subject: include/linux/cycx_x25.h: Remove unused header The header file include/linux/cycx_x25.h does not seem to be used anywhere. It was orphaned by 6fcdf4facb "wanrouter: delete now orphaned header content, files/drivers". Remove it. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- include/linux/cycx_x25.h | 125 ----------------------------------------------- 1 file changed, 125 deletions(-) delete mode 100644 include/linux/cycx_x25.h (limited to 'include') diff --git a/include/linux/cycx_x25.h b/include/linux/cycx_x25.h deleted file mode 100644 index 362bf19d6cf1..000000000000 --- a/include/linux/cycx_x25.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef _CYCX_X25_H -#define _CYCX_X25_H -/* -* cycx_x25.h Cyclom X.25 firmware API definitions. -* -* Author: Arnaldo Carvalho de Melo -* -* Copyright: (c) 1998-2003 Arnaldo Carvalho de Melo -* -* Based on sdla_x25.h by Gene Kozin <74604.152@compuserve.com> -* -* This program is free software; you can redistribute it and/or -* modify it under the terms of the GNU General Public License -* as published by the Free Software Foundation; either version -* 2 of the License, or (at your option) any later version. -* ============================================================================ -* 2000/04/02 acme dprintk and cycx_debug -* 1999/01/03 acme judicious use of data types -* 1999/01/02 acme #define X25_ACK_N3 0x4411 -* 1998/12/28 acme cleanup: lot'o'things removed -* commands listed, -* TX25Cmd & TX25Config structs -* typedef'ed -*/ -#ifndef PACKED -#define PACKED __attribute__((packed)) -#endif - -/* X.25 shared memory layout. */ -#define X25_MBOX_OFFS 0x300 /* general mailbox block */ -#define X25_RXMBOX_OFFS 0x340 /* receive mailbox */ - -/* Debug */ -#define dprintk(level, format, a...) if (cycx_debug >= level) printk(format, ##a) - -extern unsigned int cycx_debug; - -/* Data Structures */ -/* X.25 Command Block. */ -struct cycx_x25_cmd { - u16 command; - u16 link; /* values: 0 or 1 */ - u16 len; /* values: 0 thru 0x205 (517) */ - u32 buf; -} PACKED; - -/* Defines for the 'command' field. */ -#define X25_CONNECT_REQUEST 0x4401 -#define X25_CONNECT_RESPONSE 0x4402 -#define X25_DISCONNECT_REQUEST 0x4403 -#define X25_DISCONNECT_RESPONSE 0x4404 -#define X25_DATA_REQUEST 0x4405 -#define X25_ACK_TO_VC 0x4406 -#define X25_INTERRUPT_RESPONSE 0x4407 -#define X25_CONFIG 0x4408 -#define X25_CONNECT_INDICATION 0x4409 -#define X25_CONNECT_CONFIRM 0x440A -#define X25_DISCONNECT_INDICATION 0x440B -#define X25_DISCONNECT_CONFIRM 0x440C -#define X25_DATA_INDICATION 0x440E -#define X25_INTERRUPT_INDICATION 0x440F -#define X25_ACK_FROM_VC 0x4410 -#define X25_ACK_N3 0x4411 -#define X25_CONNECT_COLLISION 0x4413 -#define X25_N3WIN 0x4414 -#define X25_LINE_ON 0x4415 -#define X25_LINE_OFF 0x4416 -#define X25_RESET_REQUEST 0x4417 -#define X25_LOG 0x4500 -#define X25_STATISTIC 0x4600 -#define X25_TRACE 0x4700 -#define X25_N2TRACEXC 0x4702 -#define X25_N3TRACEXC 0x4703 - -/** - * struct cycx_x25_config - cyclom2x x25 firmware configuration - * @link - link number - * @speed - line speed - * @clock - internal/external - * @n2 - # of level 2 retransm.(values: 1 thru FF) - * @n2win - level 2 window (values: 1 thru 7) - * @n3win - level 3 window (values: 1 thru 7) - * @nvc - # of logical channels (values: 1 thru 64) - * @pktlen - level 3 packet length - log base 2 of size - * @locaddr - my address - * @remaddr - remote address - * @t1 - time, in seconds - * @t2 - time, in seconds - * @t21 - time, in seconds - * @npvc - # of permanent virt. circuits (1 thru nvc) - * @t23 - time, in seconds - * @flags - see dosx25.doc, in portuguese, for details - */ -struct cycx_x25_config { - u8 link; - u8 speed; - u8 clock; - u8 n2; - u8 n2win; - u8 n3win; - u8 nvc; - u8 pktlen; - u8 locaddr; - u8 remaddr; - u16 t1; - u16 t2; - u8 t21; - u8 npvc; - u8 t23; - u8 flags; -} PACKED; - -struct cycx_x25_stats { - u16 rx_crc_errors; - u16 rx_over_errors; - u16 n2_tx_frames; - u16 n2_rx_frames; - u16 tx_timeouts; - u16 rx_timeouts; - u16 n3_tx_packets; - u16 n3_rx_packets; - u16 tx_aborts; - u16 rx_aborts; -} PACKED; -#endif /* _CYCX_X25_H */ -- cgit v1.2.3 From fbd74659d4513816a6249b0db491e8d831803520 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 28 Aug 2014 13:44:32 +0200 Subject: include/linux/i82593.h: Remove unused header The header file include/linux/i82593.h does not seem to be used anywhere. It was orphaned by 8a594170 "drivers/net: delete intel i825xx based znet notebook driver". Remove it. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- include/linux/i82593.h | 229 ------------------------------------------------- 1 file changed, 229 deletions(-) delete mode 100644 include/linux/i82593.h (limited to 'include') diff --git a/include/linux/i82593.h b/include/linux/i82593.h deleted file mode 100644 index afac5c7a323d..000000000000 --- a/include/linux/i82593.h +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Definitions for Intel 82593 CSMA/CD Core LAN Controller - * The definitions are taken from the 1992 users manual with Intel - * order number 297125-001. - * - * /usr/src/pc/RCS/i82593.h,v 1.1 1996/07/17 15:23:12 root Exp - * - * Copyright 1994, Anders Klemets - * - * HISTORY - * i82593.h,v - * Revision 1.4 2005/11/4 09:15:00 baroniunas - * Modified copyright with permission of author as follows: - * - * "If I82539.H is the only file with my copyright statement - * that is included in the Source Forge project, then you have - * my approval to change the copyright statement to be a GPL - * license, in the way you proposed on October 10." - * - * Revision 1.1 1996/07/17 15:23:12 root - * Initial revision - * - * Revision 1.3 1995/04/05 15:13:58 adj - * Initial alpha release - * - * Revision 1.2 1994/06/16 23:57:31 klemets - * Mirrored all the fields in the configuration block. - * - * Revision 1.1 1994/06/02 20:25:34 klemets - * Initial revision - * - * - */ -#ifndef _I82593_H -#define _I82593_H - -/* Intel 82593 CSMA/CD Core LAN Controller */ - -/* Port 0 Command Register definitions */ - -/* Execution operations */ -#define OP0_NOP 0 /* CHNL = 0 */ -#define OP0_SWIT_TO_PORT_1 0 /* CHNL = 1 */ -#define OP0_IA_SETUP 1 -#define OP0_CONFIGURE 2 -#define OP0_MC_SETUP 3 -#define OP0_TRANSMIT 4 -#define OP0_TDR 5 -#define OP0_DUMP 6 -#define OP0_DIAGNOSE 7 -#define OP0_TRANSMIT_NO_CRC 9 -#define OP0_RETRANSMIT 12 -#define OP0_ABORT 13 -/* Reception operations */ -#define OP0_RCV_ENABLE 8 -#define OP0_RCV_DISABLE 10 -#define OP0_STOP_RCV 11 -/* Status pointer control operations */ -#define OP0_FIX_PTR 15 /* CHNL = 1 */ -#define OP0_RLS_PTR 15 /* CHNL = 0 */ -#define OP0_RESET 14 - -#define CR0_CHNL (1 << 4) /* 0=Channel 0, 1=Channel 1 */ -#define CR0_STATUS_0 0x00 -#define CR0_STATUS_1 0x20 -#define CR0_STATUS_2 0x40 -#define CR0_STATUS_3 0x60 -#define CR0_INT_ACK (1 << 7) /* 0=No ack, 1=acknowledge */ - -/* Port 0 Status Register definitions */ - -#define SR0_NO_RESULT 0 /* dummy */ -#define SR0_EVENT_MASK 0x0f -#define SR0_IA_SETUP_DONE 1 -#define SR0_CONFIGURE_DONE 2 -#define SR0_MC_SETUP_DONE 3 -#define SR0_TRANSMIT_DONE 4 -#define SR0_TDR_DONE 5 -#define SR0_DUMP_DONE 6 -#define SR0_DIAGNOSE_PASSED 7 -#define SR0_TRANSMIT_NO_CRC_DONE 9 -#define SR0_RETRANSMIT_DONE 12 -#define SR0_EXECUTION_ABORTED 13 -#define SR0_END_OF_FRAME 8 -#define SR0_RECEPTION_ABORTED 10 -#define SR0_DIAGNOSE_FAILED 15 -#define SR0_STOP_REG_HIT 11 - -#define SR0_CHNL (1 << 4) -#define SR0_EXECUTION (1 << 5) -#define SR0_RECEPTION (1 << 6) -#define SR0_INTERRUPT (1 << 7) -#define SR0_BOTH_RX_TX (SR0_EXECUTION | SR0_RECEPTION) - -#define SR3_EXEC_STATE_MASK 0x03 -#define SR3_EXEC_IDLE 0 -#define SR3_TX_ABORT_IN_PROGRESS 1 -#define SR3_EXEC_ACTIVE 2 -#define SR3_ABORT_IN_PROGRESS 3 -#define SR3_EXEC_CHNL (1 << 2) -#define SR3_STP_ON_NO_RSRC (1 << 3) -#define SR3_RCVING_NO_RSRC (1 << 4) -#define SR3_RCV_STATE_MASK 0x60 -#define SR3_RCV_IDLE 0x00 -#define SR3_RCV_READY 0x20 -#define SR3_RCV_ACTIVE 0x40 -#define SR3_RCV_STOP_IN_PROG 0x60 -#define SR3_RCV_CHNL (1 << 7) - -/* Port 1 Command Register definitions */ - -#define OP1_NOP 0 -#define OP1_SWIT_TO_PORT_0 1 -#define OP1_INT_DISABLE 2 -#define OP1_INT_ENABLE 3 -#define OP1_SET_TS 5 -#define OP1_RST_TS 7 -#define OP1_POWER_DOWN 8 -#define OP1_RESET_RING_MNGMT 11 -#define OP1_RESET 14 -#define OP1_SEL_RST 15 - -#define CR1_STATUS_4 0x00 -#define CR1_STATUS_5 0x20 -#define CR1_STATUS_6 0x40 -#define CR1_STOP_REG_UPDATE (1 << 7) - -/* Receive frame status bits */ - -#define RX_RCLD (1 << 0) -#define RX_IA_MATCH (1 << 1) -#define RX_NO_AD_MATCH (1 << 2) -#define RX_NO_SFD (1 << 3) -#define RX_SRT_FRM (1 << 7) -#define RX_OVRRUN (1 << 8) -#define RX_ALG_ERR (1 << 10) -#define RX_CRC_ERR (1 << 11) -#define RX_LEN_ERR (1 << 12) -#define RX_RCV_OK (1 << 13) -#define RX_TYP_LEN (1 << 15) - -/* Transmit status bits */ - -#define TX_NCOL_MASK 0x0f -#define TX_FRTL (1 << 4) -#define TX_MAX_COL (1 << 5) -#define TX_HRT_BEAT (1 << 6) -#define TX_DEFER (1 << 7) -#define TX_UND_RUN (1 << 8) -#define TX_LOST_CTS (1 << 9) -#define TX_LOST_CRS (1 << 10) -#define TX_LTCOL (1 << 11) -#define TX_OK (1 << 13) -#define TX_COLL (1 << 15) - -struct i82593_conf_block { - u_char fifo_limit : 4, - forgnesi : 1, - fifo_32 : 1, - d6mod : 1, - throttle_enb : 1; - u_char throttle : 6, - cntrxint : 1, - contin : 1; - u_char addr_len : 3, - acloc : 1, - preamb_len : 2, - loopback : 2; - u_char lin_prio : 3, - tbofstop : 1, - exp_prio : 3, - bof_met : 1; - u_char : 4, - ifrm_spc : 4; - u_char : 5, - slottim_low : 3; - u_char slottim_hi : 3, - : 1, - max_retr : 4; - u_char prmisc : 1, - bc_dis : 1, - : 1, - crs_1 : 1, - nocrc_ins : 1, - crc_1632 : 1, - : 1, - crs_cdt : 1; - u_char cs_filter : 3, - crs_src : 1, - cd_filter : 3, - : 1; - u_char : 2, - min_fr_len : 6; - u_char lng_typ : 1, - lng_fld : 1, - rxcrc_xf : 1, - artx : 1, - sarec : 1, - tx_jabber : 1, /* why is this called max_len in the manual? */ - hash_1 : 1, - lbpkpol : 1; - u_char : 6, - fdx : 1, - : 1; - u_char dummy_6 : 6, /* supposed to be ones */ - mult_ia : 1, - dis_bof : 1; - u_char dummy_1 : 1, /* supposed to be one */ - tx_ifs_retrig : 2, - mc_all : 1, - rcv_mon : 2, - frag_acpt : 1, - tstrttrs : 1; - u_char fretx : 1, - runt_eop : 1, - hw_sw_pin : 1, - big_endn : 1, - syncrqs : 1, - sttlen : 1, - tx_eop : 1, - rx_eop : 1; - u_char rbuf_size : 5, - rcvstop : 1, - : 2; -}; - -#define I82593_MAX_MULTICAST_ADDRESSES 128 /* Hardware hashed filter */ - -#endif /* _I82593_H */ -- cgit v1.2.3 From 6fb7c3778f0fba0bad099c30e834c413c4f8bcb5 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 28 Aug 2014 13:44:33 +0200 Subject: include/linux/phonedev.h: Remove unused header The header file include/linux/phonedev.h does not seem to be used anywhere. It was orphaned by 7326446c "Staging: remove telephony drivers". Remove it. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- include/linux/phonedev.h | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 include/linux/phonedev.h (limited to 'include') diff --git a/include/linux/phonedev.h b/include/linux/phonedev.h deleted file mode 100644 index 4269de99e320..000000000000 --- a/include/linux/phonedev.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __LINUX_PHONEDEV_H -#define __LINUX_PHONEDEV_H - -#include - -#ifdef __KERNEL__ - -#include - -struct phone_device { - struct phone_device *next; - const struct file_operations *f_op; - int (*open) (struct phone_device *, struct file *); - int board; /* Device private index */ - int minor; -}; - -extern int phonedev_init(void); -#define PHONE_MAJOR 100 -extern int phone_register_device(struct phone_device *, int unit); -#define PHONE_UNIT_ANY -1 -extern void phone_unregister_device(struct phone_device *); - -#endif -#endif -- cgit v1.2.3 From 918bbc4ffdb84e9d2696315e427a6c43de65bc01 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 28 Aug 2014 13:44:34 +0200 Subject: include/rxrpc/types.h: Remove unused header The header file include/rxrpc/types.h does not seem to be used anywhere. It was orphaned by 63b6be55 "[AF_RXRPC]: Delete the old RxRPC code.". Remove it. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- include/rxrpc/types.h | 41 ----------------------------------------- 1 file changed, 41 deletions(-) delete mode 100644 include/rxrpc/types.h (limited to 'include') diff --git a/include/rxrpc/types.h b/include/rxrpc/types.h deleted file mode 100644 index 30d48f6da228..000000000000 --- a/include/rxrpc/types.h +++ /dev/null @@ -1,41 +0,0 @@ -/* types.h: Rx types - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _LINUX_RXRPC_TYPES_H -#define _LINUX_RXRPC_TYPES_H - -#include -#include -#include -#include -#include -#include - -typedef uint32_t rxrpc_seq_t; /* Rx message sequence number */ -typedef uint32_t rxrpc_serial_t; /* Rx message serial number */ -typedef __be32 rxrpc_seq_net_t; /* on-the-wire Rx message sequence number */ -typedef __be32 rxrpc_serial_net_t; /* on-the-wire Rx message serial number */ - -struct rxrpc_call; -struct rxrpc_connection; -struct rxrpc_header; -struct rxrpc_message; -struct rxrpc_operation; -struct rxrpc_peer; -struct rxrpc_service; -typedef struct rxrpc_timer rxrpc_timer_t; -struct rxrpc_transport; - -typedef void (*rxrpc_call_attn_func_t)(struct rxrpc_call *call); -typedef void (*rxrpc_call_error_func_t)(struct rxrpc_call *call); -typedef void (*rxrpc_call_aemap_func_t)(struct rxrpc_call *call); - -#endif /* _LINUX_RXRPC_TYPES_H */ -- cgit v1.2.3 From de20fe8e2cc3c4ca13fdb529e6720d9d199333fe Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Aug 2014 21:26:35 -0700 Subject: net: Allocate a new 16 bits for flags in skbuff Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b69b7b512c06..3c9574c80933 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -598,6 +598,10 @@ struct sk_buff { __u32 reserved_tailroom; }; + kmemcheck_bitfield_begin(flags3); + /* 16 bit hole */ + kmemcheck_bitfield_end(flags3); + __be16 inner_protocol; __u16 inner_transport_header; __u16 inner_network_header; -- cgit v1.2.3 From 77cffe23c1f88835f6bd7b47bfa0c060c2969828 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Aug 2014 21:26:46 -0700 Subject: net: Clarification of CHECKSUM_UNNECESSARY This patch: - Clarifies the specific requirements of devices returning CHECKSUM_UNNECESSARY (comments in skbuff.h). - Adds csum_level field to skbuff. This is used to express how many checksums are covered by CHECKSUM_UNNECESSARY (stores n - 1). This replaces the overloading of skb->encapsulation, that field is is now only used to indicate inner headers are valid. - Change __skb_checksum_validate_needed to "consume" each checksum as indicated by csum_level as layers of the the packet are parsed. - Remove skb_pop_rcv_encapsulation, no longer needed in the new csum_level model. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 -- include/linux/skbuff.h | 74 ++++++++++++++++++++++++++++++++++---------------- net/ipv4/gre_demux.c | 1 - 3 files changed, 51 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index beb377b2d4b7..67527f3d3be2 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1158,8 +1158,6 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (!vs) goto drop; - skb_pop_rcv_encapsulation(skb); - vs->rcv(vs, skb, vxh->vx_vni); return 0; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3c9574c80933..c93b5859a772 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -47,11 +47,29 @@ * * The hardware you're dealing with doesn't calculate the full checksum * (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums - * for specific protocols e.g. TCP/UDP/SCTP, then, for such packets it will - * set CHECKSUM_UNNECESSARY if their checksums are okay. skb->csum is still - * undefined in this case though. It is a bad option, but, unfortunately, - * nowadays most vendors do this. Apparently with the secret goal to sell - * you new devices, when you will add new protocol to your host, f.e. IPv6 8) + * for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY + * if their checksums are okay. skb->csum is still undefined in this case + * though. It is a bad option, but, unfortunately, nowadays most vendors do + * this. Apparently with the secret goal to sell you new devices, when you + * will add new protocol to your host, f.e. IPv6 8) + * + * CHECKSUM_UNNECESSARY is applicable to following protocols: + * TCP: IPv6 and IPv4. + * UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a + * zero UDP checksum for either IPv4 or IPv6, the networking stack + * may perform further validation in this case. + * GRE: only if the checksum is present in the header. + * SCTP: indicates the CRC in SCTP header has been validated. + * + * skb->csum_level indicates the number of consecutive checksums found in + * the packet minus one that have been verified as CHECKSUM_UNNECESSARY. + * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet + * and a device is able to verify the checksums for UDP (possibly zero), + * GRE (checksum flag is set), and TCP-- skb->csum_level would be set to + * two. If the device were only able to verify the UDP checksum and not + * GRE, either because it doesn't support GRE checksum of because GRE + * checksum is bad, skb->csum_level would be set to zero (TCP checksum is + * not considered in this case). * * CHECKSUM_COMPLETE: * @@ -112,6 +130,9 @@ #define CHECKSUM_COMPLETE 2 #define CHECKSUM_PARTIAL 3 +/* Maximum value in skb->csum_level */ +#define SKB_MAX_CSUM_LEVEL 3 + #define SKB_DATA_ALIGN(X) ALIGN(X, SMP_CACHE_BYTES) #define SKB_WITH_OVERHEAD(X) \ ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) @@ -571,11 +592,7 @@ struct sk_buff { __u8 wifi_acked:1; __u8 no_fcs:1; __u8 head_frag:1; - /* Encapsulation protocol and NIC drivers should use - * this flag to indicate to each other if the skb contains - * encapsulated packet or not and maybe use the inner packet - * headers if needed - */ + /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; @@ -599,7 +616,8 @@ struct sk_buff { }; kmemcheck_bitfield_begin(flags3); - /* 16 bit hole */ + __u8 csum_level:2; + /* 14 bit hole */ kmemcheck_bitfield_end(flags3); __be16 inner_protocol; @@ -1866,18 +1884,6 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) return pskb_may_pull(skb, skb_network_offset(skb) + len); } -static inline void skb_pop_rcv_encapsulation(struct sk_buff *skb) -{ - /* Only continue with checksum unnecessary if device indicated - * it is valid across encapsulation (skb->encapsulation was set). - */ - if (skb->ip_summed == CHECKSUM_UNNECESSARY && !skb->encapsulation) - skb->ip_summed = CHECKSUM_NONE; - - skb->encapsulation = 0; - skb->csum_valid = 0; -} - /* * CPUs often take a performance hit when accessing unaligned memory * locations. The actual performance hit varies, it can be small if the @@ -2798,6 +2804,27 @@ static inline __sum16 skb_checksum_complete(struct sk_buff *skb) 0 : __skb_checksum_complete(skb); } +static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level == 0) + skb->ip_summed = CHECKSUM_NONE; + else + skb->csum_level--; + } +} + +static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level < SKB_MAX_CSUM_LEVEL) + skb->csum_level++; + } else if (skb->ip_summed == CHECKSUM_NONE) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = 0; + } +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise @@ -2809,6 +2836,7 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, { if (skb_csum_unnecessary(skb) || (zero_okay && !check)) { skb->csum_valid = 1; + __skb_decr_checksum_unnecessary(skb); return false; } diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 7c1a8ff974dd..0485bf7f8f03 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -125,7 +125,6 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, *csum_err = true; return -EINVAL; } - skb_pop_rcv_encapsulation(skb); options++; } -- cgit v1.2.3 From 662880f4420340aad4f9a62a349c6c9d4faa1a5d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Aug 2014 21:26:56 -0700 Subject: net: Allow GRO to use and set levels of checksum unnecessary Allow GRO path to "consume" checksums provided in CHECKSUM_UNNECESSARY and to report new checksums verfied for use in fallback to normal path. Change GRO checksum path to track csum_level using a csum_cnt field in NAPI_GRO_CB. On GRO initialization, if ip_summed is CHECKSUM_UNNECESSARY set NAPI_GRO_CB(skb)->csum_cnt to skb->csum_level + 1. For each checksum verified, decrement NAPI_GRO_CB(skb)->csum_cnt while its greater than zero. If a checksum is verfied and NAPI_GRO_CB(skb)->csum_cnt == 0, we have verified a deeper checksum than originally indicated in skbuf so increment csum_level (or initialize to CHECKSUM_UNNECESSARY if ip_summed is CHECKSUM_NONE or CHECKSUM_COMPLETE). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 26 ++++++++++++-------------- net/core/dev.c | 24 ++++++++++++++++-------- net/ipv4/gre_offload.c | 7 ++----- net/ipv4/udp_offload.c | 5 +++-- 4 files changed, 33 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dfc1d8b8bd0f..456eb1fe51e8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1883,8 +1883,8 @@ struct napi_gro_cb { /* GRO checksum is valid */ u8 csum_valid:1; - /* Number encapsulation layers crossed */ - u8 encapsulation; + /* Number of checksums via CHECKSUM_UNNECESSARY */ + u8 csum_cnt:3; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2179,8 +2179,7 @@ static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, __sum16 check) { return (skb->ip_summed != CHECKSUM_PARTIAL && - (skb->ip_summed != CHECKSUM_UNNECESSARY || - (NAPI_GRO_CB(skb)->encapsulation > skb->encapsulation)) && + NAPI_GRO_CB(skb)->csum_cnt == 0 && (!zero_okay || check)); } @@ -2196,18 +2195,17 @@ static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, return __skb_gro_checksum_complete(skb); } -/* Update skb for CHECKSUM_UNNECESSARY when we verified a top level - * checksum or an encapsulated one during GRO. This saves work - * if we fallback to normal path with the packet. - */ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) { - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (NAPI_GRO_CB(skb)->encapsulation) - skb->encapsulation = 1; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->encapsulation = 0; + if (NAPI_GRO_CB(skb)->csum_cnt > 0) { + /* Consume a checksum from CHECKSUM_UNNECESSARY */ + NAPI_GRO_CB(skb)->csum_cnt--; + } else { + /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we + * verified a new top level checksum or an encapsulated one + * during GRO. This saves work if we fallback to normal path. + */ + __skb_incr_checksum_unnecessary(skb); } } diff --git a/net/core/dev.c b/net/core/dev.c index 26d296c2447c..a6077ef56345 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3962,13 +3962,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff gro_list_prepare(napi, skb); - if (skb->ip_summed == CHECKSUM_COMPLETE) { - NAPI_GRO_CB(skb)->csum = skb->csum; - NAPI_GRO_CB(skb)->csum_valid = 1; - } else { - NAPI_GRO_CB(skb)->csum_valid = 0; - } - rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type != type || !ptype->callbacks.gro_receive) @@ -3980,7 +3973,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; - NAPI_GRO_CB(skb)->encapsulation = 0; + + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + NAPI_GRO_CB(skb)->csum_cnt = 0; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + NAPI_GRO_CB(skb)->csum_valid = 0; + break; + default: + NAPI_GRO_CB(skb)->csum_cnt = 0; + NAPI_GRO_CB(skb)->csum_valid = 0; + } pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index d1bd16937d93..a4d7965fb880 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -172,12 +172,9 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, } /* Don't bother verifying checksum if we're going to flush anyway. */ - if (greh->flags & GRE_CSUM) { - if (!NAPI_GRO_CB(skb)->flush && - skb_gro_checksum_simple_validate(skb)) + if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_simple_validate(skb)) goto out_unlock; - NAPI_GRO_CB(skb)->encapsulation++; - } flush = 0; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 8ed460e3753c..a6adff98382a 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -238,12 +238,13 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, int flush = 1; if (NAPI_GRO_CB(skb)->udp_mark || - (!skb->encapsulation && !NAPI_GRO_CB(skb)->csum_valid)) + (skb->ip_summed != CHECKSUM_PARTIAL && + NAPI_GRO_CB(skb)->csum_cnt == 0 && + !NAPI_GRO_CB(skb)->csum_valid)) goto out; /* mark that this skb passed once through the udp gro layer */ NAPI_GRO_CB(skb)->udp_mark = 1; - NAPI_GRO_CB(skb)->encapsulation++; rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); -- cgit v1.2.3 From cfdbeeafdbbdbc006f700e92cbad2cb5d4529f3d Mon Sep 17 00:00:00 2001 From: Vincent Cuissard Date: Tue, 22 Jul 2014 19:48:38 +0200 Subject: NFC: NCI: Add support of ISO15693 Update nci.h to respect latest NCI specification proposal (stop using proprietary opcodes). Handle ISO15693 parameters in NCI_RF_ACTIVATED_NTF handler. Signed-off-by: Vincent Cuissard Signed-off-by: Samuel Ortiz --- include/net/nfc/nci.h | 16 +++++++++++++--- net/nfc/nci/core.c | 8 ++++++++ net/nfc/nci/ntf.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/nfc/nci.h b/include/net/nfc/nci.h index fbfa4e471abb..9eca9ae2280c 100644 --- a/include/net/nfc/nci.h +++ b/include/net/nfc/nci.h @@ -2,6 +2,7 @@ * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * + * Copyright (C) 2014 Marvell International Ltd. * Copyright (C) 2011 Texas Instruments, Inc. * * Written by Ilan Elias @@ -65,19 +66,18 @@ #define NCI_NFC_F_PASSIVE_POLL_MODE 0x02 #define NCI_NFC_A_ACTIVE_POLL_MODE 0x03 #define NCI_NFC_F_ACTIVE_POLL_MODE 0x05 -#define NCI_NFC_15693_PASSIVE_POLL_MODE 0x06 +#define NCI_NFC_V_PASSIVE_POLL_MODE 0x06 #define NCI_NFC_A_PASSIVE_LISTEN_MODE 0x80 #define NCI_NFC_B_PASSIVE_LISTEN_MODE 0x81 #define NCI_NFC_F_PASSIVE_LISTEN_MODE 0x82 #define NCI_NFC_A_ACTIVE_LISTEN_MODE 0x83 #define NCI_NFC_F_ACTIVE_LISTEN_MODE 0x85 -#define NCI_NFC_15693_PASSIVE_LISTEN_MODE 0x86 /* NCI RF Technologies */ #define NCI_NFC_RF_TECHNOLOGY_A 0x00 #define NCI_NFC_RF_TECHNOLOGY_B 0x01 #define NCI_NFC_RF_TECHNOLOGY_F 0x02 -#define NCI_NFC_RF_TECHNOLOGY_15693 0x03 +#define NCI_NFC_RF_TECHNOLOGY_V 0x03 /* NCI Bit Rates */ #define NCI_NFC_BIT_RATE_106 0x00 @@ -87,6 +87,7 @@ #define NCI_NFC_BIT_RATE_1695 0x04 #define NCI_NFC_BIT_RATE_3390 0x05 #define NCI_NFC_BIT_RATE_6780 0x06 +#define NCI_NFC_BIT_RATE_26 0x20 /* NCI RF Protocols */ #define NCI_RF_PROTOCOL_UNKNOWN 0x00 @@ -95,6 +96,7 @@ #define NCI_RF_PROTOCOL_T3T 0x03 #define NCI_RF_PROTOCOL_ISO_DEP 0x04 #define NCI_RF_PROTOCOL_NFC_DEP 0x05 +#define NCI_RF_PROTOCOL_T5T 0x06 /* NCI RF Interfaces */ #define NCI_RF_INTERFACE_NFCEE_DIRECT 0x00 @@ -328,6 +330,12 @@ struct rf_tech_specific_params_nfcf_poll { __u8 sensf_res[18]; /* 16 or 18 Bytes */ } __packed; +struct rf_tech_specific_params_nfcv_poll { + __u8 res_flags; + __u8 dsfid; + __u8 uid[8]; /* 8 Bytes */ +} __packed; + struct nci_rf_discover_ntf { __u8 rf_discovery_id; __u8 rf_protocol; @@ -338,6 +346,7 @@ struct nci_rf_discover_ntf { struct rf_tech_specific_params_nfca_poll nfca_poll; struct rf_tech_specific_params_nfcb_poll nfcb_poll; struct rf_tech_specific_params_nfcf_poll nfcf_poll; + struct rf_tech_specific_params_nfcv_poll nfcv_poll; } rf_tech_specific_params; __u8 ntf_type; @@ -372,6 +381,7 @@ struct nci_rf_intf_activated_ntf { struct rf_tech_specific_params_nfca_poll nfca_poll; struct rf_tech_specific_params_nfcb_poll nfcb_poll; struct rf_tech_specific_params_nfcf_poll nfcf_poll; + struct rf_tech_specific_params_nfcv_poll nfcv_poll; } rf_tech_specific_params; __u8 data_exch_rf_tech_and_mode; diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 2b400e1a8695..860080803a3e 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -231,6 +231,14 @@ static void nci_rf_discover_req(struct nci_dev *ndev, unsigned long opt) cmd.num_disc_configs++; } + if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS) && + (protocols & NFC_PROTO_ISO15693_MASK)) { + cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode = + NCI_NFC_V_PASSIVE_POLL_MODE; + cmd.disc_configs[cmd.num_disc_configs].frequency = 1; + cmd.num_disc_configs++; + } + nci_send_cmd(ndev, NCI_OP_RF_DISCOVER_CMD, (1 + (cmd.num_disc_configs * sizeof(struct disc_config))), &cmd); diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index df91bb95b12a..25e44cebd60a 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -2,6 +2,7 @@ * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * + * Copyright (C) 2014 Marvell International Ltd. * Copyright (C) 2011 Texas Instruments, Inc. * * Written by Ilan Elias @@ -155,6 +156,17 @@ static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, return data; } +static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcv_poll *nfcv_poll, + __u8 *data) +{ + ++data; + nfcv_poll->dsfid = *data++; + memcpy(nfcv_poll->uid, data, NFC_ISO15693_UID_MAXSIZE); + data += NFC_ISO15693_UID_MAXSIZE; + return data; +} + static int nci_add_new_protocol(struct nci_dev *ndev, struct nfc_target *target, __u8 rf_protocol, @@ -164,6 +176,7 @@ static int nci_add_new_protocol(struct nci_dev *ndev, struct rf_tech_specific_params_nfca_poll *nfca_poll; struct rf_tech_specific_params_nfcb_poll *nfcb_poll; struct rf_tech_specific_params_nfcf_poll *nfcf_poll; + struct rf_tech_specific_params_nfcv_poll *nfcv_poll; __u32 protocol; if (rf_protocol == NCI_RF_PROTOCOL_T1T) @@ -179,6 +192,8 @@ static int nci_add_new_protocol(struct nci_dev *ndev, protocol = NFC_PROTO_FELICA_MASK; else if (rf_protocol == NCI_RF_PROTOCOL_NFC_DEP) protocol = NFC_PROTO_NFC_DEP_MASK; + else if (rf_protocol == NCI_RF_PROTOCOL_T5T) + protocol = NFC_PROTO_ISO15693_MASK; else protocol = 0; @@ -213,6 +228,12 @@ static int nci_add_new_protocol(struct nci_dev *ndev, memcpy(target->sensf_res, nfcf_poll->sensf_res, target->sensf_res_len); } + } else if (rf_tech_and_mode == NCI_NFC_V_PASSIVE_POLL_MODE) { + nfcv_poll = (struct rf_tech_specific_params_nfcv_poll *)params; + + target->is_iso15693 = 1; + target->iso15693_dsfid = nfcv_poll->dsfid; + memcpy(target->iso15693_uid, nfcv_poll->uid, NFC_ISO15693_UID_MAXSIZE); } else { pr_err("unsupported rf_tech_and_mode 0x%x\n", rf_tech_and_mode); return -EPROTO; @@ -305,6 +326,11 @@ static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, &(ntf.rf_tech_specific_params.nfcf_poll), data); break; + case NCI_NFC_V_PASSIVE_POLL_MODE: + data = nci_extract_rf_params_nfcv_passive_poll(ndev, + &(ntf.rf_tech_specific_params.nfcv_poll), data); + break; + default: pr_err("unsupported rf_tech_and_mode 0x%x\n", ntf.rf_tech_and_mode); @@ -455,6 +481,11 @@ static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, &(ntf.rf_tech_specific_params.nfcf_poll), data); break; + case NCI_NFC_V_PASSIVE_POLL_MODE: + data = nci_extract_rf_params_nfcv_passive_poll(ndev, + &(ntf.rf_tech_specific_params.nfcv_poll), data); + break; + default: pr_err("unsupported activation_rf_tech_and_mode 0x%x\n", ntf.activation_rf_tech_and_mode); -- cgit v1.2.3 From 10b3ad8c21bb4b135768c30dd4c51a1c744da699 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:07:24 -0700 Subject: net: Do txq_trans_update() in netdev_start_xmit() That way we don't have to audit every call site to make sure it is doing this properly. Signed-off-by: David S. Miller --- drivers/net/wan/dlci.c | 6 ++++-- include/linux/netdevice.h | 10 ++++++++-- net/core/dev.c | 7 ++----- net/core/netpoll.c | 4 +--- net/core/pktgen.c | 3 +-- net/packet/af_packet.c | 7 ++----- net/sched/sch_teql.c | 3 +-- 7 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 81b22a180aad..6427e8283419 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -192,8 +192,10 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) { struct dlci_local *dlp = netdev_priv(dev); - if (skb) - netdev_start_xmit(skb, dlp->slave); + if (skb) { + struct netdev_queue *txq = skb_get_tx_queue(dev, skb); + netdev_start_xmit(skb, dlp->slave, txq); + } return NETDEV_TX_OK; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 456eb1fe51e8..16171802ea7d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3437,11 +3437,17 @@ static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, return ops->ndo_start_xmit(skb, dev); } -static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev) +static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; + int rc; - return __netdev_start_xmit(ops, skb, dev); + rc = __netdev_start_xmit(ops, skb, dev); + if (rc == NETDEV_TX_OK) + txq_trans_update(txq); + + return rc; } int netdev_class_create_file_ns(struct class_attribute *class_attr, diff --git a/net/core/dev.c b/net/core/dev.c index a6077ef56345..6392adaaa22f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2666,10 +2666,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq); trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) - txq_trans_update(txq); return rc; } @@ -2685,7 +2683,7 @@ gso: skb_len = nskb->len; trace_net_dev_start_xmit(nskb, dev); - rc = netdev_start_xmit(nskb, dev); + rc = netdev_start_xmit(nskb, dev, txq); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) @@ -2694,7 +2692,6 @@ gso: skb->next = nskb; return rc; } - txq_trans_update(txq); if (unlikely(netif_xmit_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 12b1df976562..05bc57edaa81 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -91,9 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = netdev_start_xmit(skb, dev); - if (status == NETDEV_TX_OK) - txq_trans_update(txq); + status = netdev_start_xmit(skb, dev, txq); out: return status; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d81b540096c3..34bd2ff9f121 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3335,11 +3335,10 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = netdev_start_xmit(pkt_dev->skb, odev); + ret = netdev_start_xmit(pkt_dev->skb, odev, txq); switch (ret) { case NETDEV_TX_OK: - txq_trans_update(txq); pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b7a7f5a721bd..fe305a05a8fc 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -258,11 +258,8 @@ static int packet_direct_xmit(struct sk_buff *skb) local_bh_disable(); HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_drv_stopped(txq)) { - ret = netdev_start_xmit(skb, dev); - if (ret == NETDEV_TX_OK) - txq_trans_update(txq); - } + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); local_bh_enable(); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 64cd93ca8104..193dc2cba1ec 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -316,8 +316,7 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - netdev_start_xmit(skb, slave) == NETDEV_TX_OK) { - txq_trans_update(slave_txq); + netdev_start_xmit(skb, slave, slave_txq) == NETDEV_TX_OK) { __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); -- cgit v1.2.3 From fa2dbdc253c2aee2a760c64de454cb62469ec11d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:55:22 -0700 Subject: net: Pass a "more" indication down into netdev_start_xmit() code paths. For now it will always be false. Signed-off-by: David S. Miller --- drivers/net/wan/dlci.c | 2 +- include/linux/netdevice.h | 9 +++++---- net/atm/mpc.c | 2 +- net/core/dev.c | 2 +- net/core/netpoll.c | 2 +- net/core/pktgen.c | 2 +- net/packet/af_packet.c | 2 +- net/sched/sch_teql.c | 3 ++- 8 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 6427e8283419..ae6ecf401189 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -194,7 +194,7 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) if (skb) { struct netdev_queue *txq = skb_get_tx_queue(dev, skb); - netdev_start_xmit(skb, dlp->slave, txq); + netdev_start_xmit(skb, dlp->slave, txq, false); } return NETDEV_TX_OK; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 16171802ea7d..5050218c5b7f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3431,19 +3431,20 @@ int __init dev_proc_init(void); #endif static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, - struct sk_buff *skb, struct net_device *dev) + struct sk_buff *skb, struct net_device *dev, + bool more) { - skb->xmit_more = 0; + skb->xmit_more = more ? 1 : 0; return ops->ndo_start_xmit(skb, dev); } static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) + struct netdev_queue *txq, bool more) { const struct net_device_ops *ops = dev->netdev_ops; int rc; - rc = __netdev_start_xmit(ops, skb, dev); + rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) txq_trans_update(txq); diff --git a/net/atm/mpc.c b/net/atm/mpc.c index d662da161e5a..0e982222d425 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -599,7 +599,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb, } non_ip: - return __netdev_start_xmit(mpc->old_ops, skb, dev); + return __netdev_start_xmit(mpc->old_ops, skb, dev, false); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) diff --git a/net/core/dev.c b/net/core/dev.c index ab7bb809711e..f0ed5a611a97 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2610,7 +2610,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev, txq); + rc = netdev_start_xmit(skb, dev, txq, false); trace_net_dev_xmit(skb, rc, dev, len); return rc; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 05bc57edaa81..e6645b4f330a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -91,7 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = netdev_start_xmit(skb, dev, txq); + status = netdev_start_xmit(skb, dev, txq, false); out: return status; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 34bd2ff9f121..5b36a9428c59 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3335,7 +3335,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = netdev_start_xmit(pkt_dev->skb, odev, txq); + ret = netdev_start_xmit(pkt_dev->skb, odev, txq, false); switch (ret) { case NETDEV_TX_OK: diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index fe305a05a8fc..87d20f48ff06 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -259,7 +259,7 @@ static int packet_direct_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) - ret = netdev_start_xmit(skb, dev, txq); + ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); local_bh_enable(); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 193dc2cba1ec..aaa8d03ed054 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -316,7 +316,8 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - netdev_start_xmit(skb, slave, slave_txq) == NETDEV_TX_OK) { + netdev_start_xmit(skb, slave, slave_txq, false) == + NETDEV_TX_OK) { __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); -- cgit v1.2.3 From 50cbe9ab5f8d92d2d4a327b56e96559d8f63a1fa Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 30 Aug 2014 19:13:51 -0700 Subject: net: Validate xmit SKBs right when we pull them out of the qdisc. Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 6 +----- net/sched/sch_generic.c | 5 ++++- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5050218c5b7f..47c49ba2dcf4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2827,6 +2827,7 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); +struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 704a5434f77d..75bc5b068a13 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2656,7 +2656,7 @@ struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t featur return skb; } -static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -2719,10 +2719,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, { int rc = NETDEV_TX_OK; - skb = validate_xmit_skb(skb, dev); - if (!skb) - return rc; - if (likely(!skb->next)) return xmit_one(skb, dev, txq, false); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 05b3f5d104af..f178798a5836 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -70,8 +70,11 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) } else skb = NULL; } else { - if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) + if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) { skb = q->dequeue(q); + if (skb) + skb = validate_xmit_skb(skb, qdisc_dev(q)); + } } return skb; -- cgit v1.2.3 From ce93718fb7cdbc064c3000ff59e4d3200bdfa744 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 30 Aug 2014 19:22:20 -0700 Subject: net: Don't keep around original SKB when we software segment GSO frames. Just maintain the list properly by returning the head of the remaining SKB list from dev_hard_start_xmit(). Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +-- net/core/dev.c | 79 +++++++++-------------------------------------- net/sched/sch_generic.c | 2 +- 3 files changed, 17 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 47c49ba2dcf4..202c25a9aadf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2828,8 +2828,8 @@ int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq); +struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, int *ret); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 75bc5b068a13..c89da4f306b1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2485,52 +2485,6 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) return 0; } -struct dev_gso_cb { - void (*destructor)(struct sk_buff *skb); -}; - -#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) - -static void dev_gso_skb_destructor(struct sk_buff *skb) -{ - struct dev_gso_cb *cb; - - kfree_skb_list(skb->next); - skb->next = NULL; - - cb = DEV_GSO_CB(skb); - if (cb->destructor) - cb->destructor(skb); -} - -/** - * dev_gso_segment - Perform emulated hardware segmentation on skb. - * @skb: buffer to segment - * @features: device features as applicable to this skb - * - * This function segments the given skb and stores the list of segments - * in skb->next. - */ -static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) -{ - struct sk_buff *segs; - - segs = skb_gso_segment(skb, features); - - /* Verifying header integrity only. */ - if (!segs) - return 0; - - if (IS_ERR(segs)) - return PTR_ERR(segs); - - skb->next = segs; - DEV_GSO_CB(skb)->destructor = skb->destructor; - skb->destructor = dev_gso_skb_destructor; - - return 0; -} - /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. */ @@ -2682,8 +2636,13 @@ struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) features &= dev->hw_enc_features; if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; + struct sk_buff *segs; + + segs = skb_gso_segment(skb, features); + kfree_skb(skb); + if (IS_ERR(segs)) + segs = NULL; + skb = segs; } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) @@ -2714,26 +2673,16 @@ out_null: return NULL; } -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, int *ret) { - int rc = NETDEV_TX_OK; - - if (likely(!skb->next)) - return xmit_one(skb, dev, txq, false); - - skb->next = xmit_list(skb->next, dev, txq, &rc); - if (likely(skb->next == NULL)) { - skb->destructor = DEV_GSO_CB(skb)->destructor; - consume_skb(skb); - return rc; + if (likely(!skb->next)) { + *ret = xmit_one(skb, dev, txq, false); + return skb; } - kfree_skb(skb); - - return rc; + return xmit_list(skb, dev, txq, ret); } -EXPORT_SYMBOL_GPL(dev_hard_start_xmit); static void qdisc_pkt_len_init(struct sk_buff *skb) { @@ -2945,7 +2894,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &rc); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f178798a5836..a8bf9f9928bd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -129,7 +129,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) - ret = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &ret); HARD_TX_UNLOCK(dev, txq); -- cgit v1.2.3 From 5a21232983aa7acfe7fd26170832a9e0a4a7b4ae Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 31 Aug 2014 15:12:41 -0700 Subject: net: Support for csum_bad in skbuff This flag indicates that an invalid checksum was detected in the packet. __skb_mark_checksum_bad helper function was added to set this. Checksums can be marked bad from a driver or the GRO path (the latter is implemented in this patch). csum_bad is checked in __skb_checksum_validate_complete (i.e. calling that when ip_summed == CHECKSUM_NONE). csum_bad works in conjunction with ip_summed value. In the case that ip_summed is CHECKSUM_NONE and csum_bad is set, this implies that the first (or next) checksum encountered in the packet is bad. When ip_summed is CHECKSUM_UNNECESSARY, the first checksum after the last one validated is bad. For example, if ip_summed == CHECKSUM_UNNECESSARY, csum_level == 1, and csum_bad is set-- then the third checksum in the packet is bad. In the normal path, the packet will be dropped when processing the protocol layer of the bad checksum: __skb_decr_checksum_unnecessary called twice for the good checksums changing ip_summed to CHECKSUM_NONE so that __skb_checksum_validate_complete is called to validate the third checksum and that will fail since csum_bad is set. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- include/linux/skbuff.h | 21 ++++++++++++++++++++- net/core/dev.c | 2 +- 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 202c25a9aadf..a0ab6d9d400a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2216,7 +2216,9 @@ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_gro_checksum_validate_complete(skb, \ compute_pseudo(skb, proto)); \ - if (!__ret) \ + if (__ret) \ + __skb_mark_checksum_bad(skb); \ + else \ skb_gro_incr_csum_unnecessary(skb); \ __ret; \ }) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c93b5859a772..23710a243439 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -617,7 +617,8 @@ struct sk_buff { kmemcheck_bitfield_begin(flags3); __u8 csum_level:2; - /* 14 bit hole */ + __u8 csum_bad:1; + /* 13 bit hole */ kmemcheck_bitfield_end(flags3); __be16 inner_protocol; @@ -2825,6 +2826,21 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } +static inline void __skb_mark_checksum_bad(struct sk_buff *skb) +{ + /* Mark current checksum as bad (typically called from GRO + * path). In the case that ip_summed is CHECKSUM_NONE + * this must be the first checksum encountered in the packet. + * When ip_summed is CHECKSUM_UNNECESSARY, this is the first + * checksum after the last one validated. For UDP, a zero + * checksum can not be marked as bad. + */ + + if (skb->ip_summed == CHECKSUM_NONE || + skb->ip_summed == CHECKSUM_UNNECESSARY) + skb->csum_bad = 1; +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise @@ -2866,6 +2882,9 @@ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, skb->csum_valid = 1; return 0; } + } else if (skb->csum_bad) { + /* ip_summed == CHECKSUM_NONE in this case */ + return 1; } skb->csum = psum; diff --git a/net/core/dev.c b/net/core/dev.c index 6857d57aa294..3774afc3bebf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3918,7 +3918,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_has_frag_list(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) goto normal; gro_list_prepare(napi, skb); -- cgit v1.2.3 From d96535a17dbbafd567961d14c08c0984ddda9c3c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 31 Aug 2014 15:12:42 -0700 Subject: net: Infrastructure for checksum unnecessary conversions For normal path, added skb_checksum_try_convert which is called to attempt to convert CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE. The primary condition to allow this is that ip_summed is CHECKSUM_NONE and csum_valid is true, which will be the state after consuming a CHECKSUM_UNNECESSARY. For GRO path, added skb_gro_checksum_try_convert which is the GRO analogue of skb_checksum_try_convert. The primary condition to allow this is that NAPI_GRO_CB(skb)->csum_cnt == 0 and NAPI_GRO_CB(skb)->csum_valid is set. This implies that we have consumed all available CHECKSUM_UNNECESSARY checksums in the GRO path. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 ++++++++++++++++++++ include/linux/skbuff.h | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a0ab6d9d400a..5be20a7bbb0d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2233,6 +2233,26 @@ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) #define skb_gro_checksum_simple_validate(skb) \ __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) +static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) +{ + return (NAPI_GRO_CB(skb)->csum_cnt == 0 && + !NAPI_GRO_CB(skb)->csum_valid); +} + +static inline void __skb_gro_checksum_convert(struct sk_buff *skb, + __sum16 check, __wsum pseudo) +{ + NAPI_GRO_CB(skb)->csum = ~pseudo; + NAPI_GRO_CB(skb)->csum_valid = 1; +} + +#define skb_gro_checksum_try_convert(skb, proto, check, compute_pseudo) \ +do { \ + if (__skb_gro_checksum_convert_check(skb)) \ + __skb_gro_checksum_convert(skb, check, \ + compute_pseudo(skb, proto)); \ +} while (0) + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 23710a243439..02529fcad1ac 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2942,6 +2942,26 @@ static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) #define skb_checksum_simple_validate(skb) \ __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) +static inline bool __skb_checksum_convert_check(struct sk_buff *skb) +{ + return (skb->ip_summed == CHECKSUM_NONE && + skb->csum_valid && !skb->csum_bad); +} + +static inline void __skb_checksum_convert(struct sk_buff *skb, + __sum16 check, __wsum pseudo) +{ + skb->csum = ~pseudo; + skb->ip_summed = CHECKSUM_COMPLETE; +} + +#define skb_checksum_try_convert(skb, proto, check, compute_pseudo) \ +do { \ + if (__skb_checksum_convert_check(skb)) \ + __skb_checksum_convert(skb, check, \ + compute_pseudo(skb, proto)); \ +} while (0) + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) void nf_conntrack_destroy(struct nf_conntrack *nfct); static inline void nf_conntrack_put(struct nf_conntrack *nfct) -- cgit v1.2.3 From 2abb7cdc0dc84e99b76ef983a1ae1978922aa9b3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 31 Aug 2014 15:12:43 -0700 Subject: udp: Add support for doing checksum unnecessary conversion Add support for doing CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion in UDP tunneling path. In the normal UDP path, we call skb_checksum_try_convert after locating the UDP socket. The check is that checksum conversion is enabled for the socket (new flag in UDP socket) and that checksum field is non-zero. In the UDP GRO path, we call skb_gro_checksum_try_convert after checksum is validated and checksum field is non-zero. Since this is already in GRO we assume that checksum conversion is always wanted. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/udp.h | 16 +++++++++++++++- net/ipv4/udp.c | 4 ++++ net/ipv4/udp_offload.c | 25 +++++++++++++++++-------- net/ipv6/udp.c | 4 ++++ net/ipv6/udp_offload.c | 24 +++++++++++++++++------- 5 files changed, 57 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 247cfdcc4b08..ee3277593222 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -49,7 +49,11 @@ struct udp_sock { unsigned int corkflag; /* Cork is required */ __u8 encap_type; /* Is this an Encapsulation socket? */ unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ - no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */ + no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ + convert_csum:1;/* On receive, convert checksum + * unnecessary to checksum complete + * if possible. + */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -98,6 +102,16 @@ static inline bool udp_get_no_check6_rx(struct sock *sk) return udp_sk(sk)->no_check6_rx; } +static inline void udp_set_convert_csum(struct sock *sk, bool val) +{ + udp_sk(sk)->convert_csum = val; +} + +static inline bool udp_get_convert_csum(struct sock *sk) +{ + return udp_sk(sk)->convert_csum; +} + #define udp_portaddr_for_each_entry(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3549c21fe5f7..0da3849fd35b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1788,6 +1788,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; + if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_compute_pseudo); + ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a6adff98382a..84e0e05c9c0e 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -290,16 +290,25 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head, { struct udphdr *uh = udp_gro_udphdr(skb); - /* Don't bother verifying checksum if we're going to flush anyway. */ - if (unlikely(!uh) || - (!NAPI_GRO_CB(skb)->flush && - skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, - inet_gro_compute_pseudo))) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + if (unlikely(!uh)) + goto flush; + /* Don't bother verifying checksum if we're going to flush anyway. */ + if (!NAPI_GRO_CB(skb)->flush) + goto skip; + + if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + inet_gro_compute_pseudo)) + goto flush; + else if (uh->check) + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_gro_compute_pseudo); +skip: return udp_gro_receive(head, skb, uh); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } int udp_gro_complete(struct sk_buff *skb, int nhoff) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 12fcce8fba46..f6ba535b6feb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -891,6 +891,10 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto csum_error; } + if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + ip6_compute_pseudo); + ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b13e377e9c53..89cb9a9b8537 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -134,16 +134,26 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head, { struct udphdr *uh = udp_gro_udphdr(skb); + if (unlikely(!uh)) + goto flush; + /* Don't bother verifying checksum if we're going to flush anyway. */ - if (unlikely(!uh) || - (!NAPI_GRO_CB(skb)->flush && - skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, - ip6_gro_compute_pseudo))) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + if (!NAPI_GRO_CB(skb)->flush) + goto skip; + if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + ip6_gro_compute_pseudo)) + goto flush; + else if (uh->check) + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + ip6_gro_compute_pseudo); + +skip: return udp_gro_receive(head, skb, uh); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } int udp6_gro_complete(struct sk_buff *skb, int nhoff) -- cgit v1.2.3 From 364a9e93243d1785f310c0964af0e24bf1adac03 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 31 Aug 2014 21:30:27 -0400 Subject: sock: deduplicate errqueue dequeue sk->sk_error_queue is dequeued in four locations. All share the exact same logic. Deduplicate. Also collapse the two critical sections for dequeue (at the top of the recv handler) and signal (at the bottom). This moves signal generation for the next packet forward, which should be harmless. It also changes the behavior if the recv handler exits early with an error. Previously, a signal for follow-up packets on the errqueue would then not be scheduled. The new behavior, to always signal, is arguably a bug fix. For rxrpc, the change causes the same function to be called repeatedly for each queued packet (because the recv handler == sk_error_report). It is likely that all packets will fail for the same reason (e.g., memory exhaustion). This code runs without sk_lock held, so it is not safe to trust that sk->sk_err is immutable inbetween releasing q->lock and the subsequent test. Introduce int err just to avoid this potential race. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/skbuff.c | 20 ++++++++++++++++++++ net/core/sock.c | 14 ++------------ net/ipv4/ip_sockglue.c | 15 ++------------- net/ipv6/datagram.c | 15 ++------------- net/rxrpc/ar-error.c | 14 +------------- 6 files changed, 28 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 7f2ab72f321a..3fde6130789d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2041,6 +2041,7 @@ void sk_stop_timer(struct sock *sk, struct timer_list *timer); int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); +struct sk_buff *sock_dequeue_err_skb(struct sock *sk); /* * Recover an error report and clear atomically diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 163b673f9e62..53ce536e3d6e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3491,6 +3491,26 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); +struct sk_buff *sock_dequeue_err_skb(struct sock *sk) +{ + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *skb_next; + int err = 0; + + spin_lock_bh(&q->lock); + skb = __skb_dequeue(q); + if (skb && (skb_next = skb_peek(q))) + err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + spin_unlock_bh(&q->lock); + + sk->sk_err = err; + if (err) + sk->sk_error_report(sk); + + return skb; +} +EXPORT_SYMBOL(sock_dequeue_err_skb); + void __skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype) diff --git a/net/core/sock.c b/net/core/sock.c index f7f2352200ad..f1a638ee93d9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2488,11 +2488,11 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; + struct sk_buff *skb; int copied, err; err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; @@ -2513,16 +2513,6 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_ERRQUEUE; err = copied; - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { - sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else - spin_unlock_bh(&sk->sk_error_queue.lock); - out_free_skb: kfree_skb(skb); out: diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5cb830c78990..455e75bcb167 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -405,7 +405,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; + struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct { struct sock_extended_err ee; @@ -415,7 +415,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int copied; err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; @@ -462,17 +462,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_ERRQUEUE; err = copied; - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - skb2 = skb_peek(&sk->sk_error_queue); - if (skb2 != NULL) { - sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else - spin_unlock_bh(&sk->sk_error_queue.lock); - out_free_skb: kfree_skb(skb); out: diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 1844e874a350..2cdc38338be3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -332,7 +332,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; + struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); struct { struct sock_extended_err ee; @@ -342,7 +342,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int copied; err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; @@ -415,17 +415,6 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_ERRQUEUE; err = copied; - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { - sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else { - spin_unlock_bh(&sk->sk_error_queue.lock); - } - out_free_skb: kfree_skb(skb); out: diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index db57458c824c..74c0fcd36838 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -37,7 +37,7 @@ void rxrpc_UDP_error_report(struct sock *sk) _enter("%p{%d}", sk, local->debug_id); - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (!skb) { _leave("UDP socket errqueue empty"); return; @@ -111,18 +111,6 @@ void rxrpc_UDP_error_report(struct sock *sk) skb_queue_tail(&trans->error_queue, skb); rxrpc_queue_work(&trans->error_handler); - /* reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - skb = skb_peek(&sk->sk_error_queue); - if (skb) { - sk->sk_err = SKB_EXT_ERR(skb)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else { - spin_unlock_bh(&sk->sk_error_queue.lock); - } - _leave(""); } -- cgit v1.2.3 From b58555f1767c9f4e330fcf168e4e753d2d9196e0 Mon Sep 17 00:00:00 2001 From: Christophe Gouault Date: Fri, 29 Aug 2014 16:16:04 +0200 Subject: xfrm: hash prefixed policies based on preflen thresholds The idea is an extension of the current policy hashing. Today only non-prefixed policies are stored in a hash table. This patch relaxes the constraints, and hashes policies whose prefix lengths are greater or equal to a configurable threshold. Each hash table (one per direction) maintains its own set of IPv4 and IPv6 thresholds (dbits4, sbits4, dbits6, sbits6), by default (32, 32, 128, 128). Example, if the output hash table is configured with values (16, 24, 56, 64): ip xfrm policy add dir out src 10.22.0.0/20 dst 10.24.1.0/24 ... => hashed ip xfrm policy add dir out src 10.22.0.0/16 dst 10.24.1.1/32 ... => hashed ip xfrm policy add dir out src 10.22.0.0/16 dst 10.24.0.0/16 ... => unhashed ip xfrm policy add dir out \ src 3ffe:304:124:2200::/60 dst 3ffe:304:124:2401::/64 ... => hashed ip xfrm policy add dir out \ src 3ffe:304:124:2200::/56 dst 3ffe:304:124:2401::2/128 ... => hashed ip xfrm policy add dir out \ src 3ffe:304:124:2200::/56 dst 3ffe:304:124:2400::/56 ... => unhashed The high order bits of the addresses (up to the threshold) are used to compute the hash key. Signed-off-by: Christophe Gouault Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 4 +++ net/xfrm/xfrm_hash.h | 76 +++++++++++++++++++++++++++++++++++++++++------- net/xfrm/xfrm_policy.c | 53 +++++++++++++++++++++++++++++---- 3 files changed, 117 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 3492434baf88..41902a8103bd 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -13,6 +13,10 @@ struct ctl_table_header; struct xfrm_policy_hash { struct hlist_head *table; unsigned int hmask; + u8 dbits4; + u8 sbits4; + u8 dbits6; + u8 sbits6; }; struct netns_xfrm { diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h index 0622d319e1f2..666c5ffe929d 100644 --- a/net/xfrm/xfrm_hash.h +++ b/net/xfrm/xfrm_hash.h @@ -3,6 +3,7 @@ #include #include +#include static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr) { @@ -28,6 +29,58 @@ static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr, saddr->a6[2] ^ saddr->a6[3]); } +static inline u32 __bits2mask32(__u8 bits) +{ + u32 mask32 = 0xffffffff; + + if (bits == 0) + mask32 = 0; + else if (bits < 32) + mask32 <<= (32 - bits); + + return mask32; +} + +static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + __u8 dbits, + __u8 sbits) +{ + return jhash_2words(ntohl(daddr->a4) & __bits2mask32(dbits), + ntohl(saddr->a4) & __bits2mask32(sbits), + 0); +} + +static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr, + __u8 prefixlen) +{ + int pdw; + int pbi; + u32 initval = 0; + + pdw = prefixlen >> 5; /* num of whole u32 in prefix */ + pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ + + if (pbi) { + __be32 mask; + + mask = htonl((0xffffffff) << (32 - pbi)); + + initval = (__force u32)(addr->a6[pdw] & mask); + } + + return jhash2((__force u32 *)addr->a6, pdw, initval); +} + +static inline unsigned int __xfrm6_dpref_spref_hash(const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + __u8 dbits, + __u8 sbits) +{ + return __xfrm6_pref_hash(daddr, dbits) ^ + __xfrm6_pref_hash(saddr, sbits); +} + static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, u32 reqid, unsigned short family, @@ -84,7 +137,8 @@ static inline unsigned int __idx_hash(u32 index, unsigned int hmask) } static inline unsigned int __sel_hash(const struct xfrm_selector *sel, - unsigned short family, unsigned int hmask) + unsigned short family, unsigned int hmask, + u8 dbits, u8 sbits) { const xfrm_address_t *daddr = &sel->daddr; const xfrm_address_t *saddr = &sel->saddr; @@ -92,19 +146,19 @@ static inline unsigned int __sel_hash(const struct xfrm_selector *sel, switch (family) { case AF_INET: - if (sel->prefixlen_d != 32 || - sel->prefixlen_s != 32) + if (sel->prefixlen_d < dbits || + sel->prefixlen_s < sbits) return hmask + 1; - h = __xfrm4_daddr_saddr_hash(daddr, saddr); + h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: - if (sel->prefixlen_d != 128 || - sel->prefixlen_s != 128) + if (sel->prefixlen_d < dbits || + sel->prefixlen_s < sbits) return hmask + 1; - h = __xfrm6_daddr_saddr_hash(daddr, saddr); + h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); @@ -113,17 +167,19 @@ static inline unsigned int __sel_hash(const struct xfrm_selector *sel, static inline unsigned int __addr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, - unsigned short family, unsigned int hmask) + unsigned short family, + unsigned int hmask, + u8 dbits, u8 sbits) { unsigned int h = 0; switch (family) { case AF_INET: - h = __xfrm4_daddr_saddr_hash(daddr, saddr); + h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: - h = __xfrm6_daddr_saddr_hash(daddr, saddr); + h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index beeed602aeb3..e6ff7b4046ea 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -344,12 +344,39 @@ static inline unsigned int idx_hash(struct net *net, u32 index) return __idx_hash(index, net->xfrm.policy_idx_hmask); } +/* calculate policy hash thresholds */ +static void __get_hash_thresh(struct net *net, + unsigned short family, int dir, + u8 *dbits, u8 *sbits) +{ + switch (family) { + case AF_INET: + *dbits = net->xfrm.policy_bydst[dir].dbits4; + *sbits = net->xfrm.policy_bydst[dir].sbits4; + break; + + case AF_INET6: + *dbits = net->xfrm.policy_bydst[dir].dbits6; + *sbits = net->xfrm.policy_bydst[dir].sbits6; + break; + + default: + *dbits = 0; + *sbits = 0; + } +} + static struct hlist_head *policy_hash_bysel(struct net *net, const struct xfrm_selector *sel, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; - unsigned int hash = __sel_hash(sel, family, hmask); + unsigned int hash; + u8 dbits; + u8 sbits; + + __get_hash_thresh(net, family, dir, &dbits, &sbits); + hash = __sel_hash(sel, family, hmask, dbits, sbits); return (hash == hmask + 1 ? &net->xfrm.policy_inexact[dir] : @@ -362,25 +389,35 @@ static struct hlist_head *policy_hash_direct(struct net *net, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; - unsigned int hash = __addr_hash(daddr, saddr, family, hmask); + unsigned int hash; + u8 dbits; + u8 sbits; + + __get_hash_thresh(net, family, dir, &dbits, &sbits); + hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); return net->xfrm.policy_bydst[dir].table + hash; } -static void xfrm_dst_hash_transfer(struct hlist_head *list, +static void xfrm_dst_hash_transfer(struct net *net, + struct hlist_head *list, struct hlist_head *ndsttable, - unsigned int nhashmask) + unsigned int nhashmask, + int dir) { struct hlist_node *tmp, *entry0 = NULL; struct xfrm_policy *pol; unsigned int h0 = 0; + u8 dbits; + u8 sbits; redo: hlist_for_each_entry_safe(pol, tmp, list, bydst) { unsigned int h; + __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, - pol->family, nhashmask); + pol->family, nhashmask, dbits, sbits); if (!entry0) { hlist_del(&pol->bydst); hlist_add_head(&pol->bydst, ndsttable+h); @@ -434,7 +471,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) write_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) - xfrm_dst_hash_transfer(odst + i, ndst, nhashmask); + xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); net->xfrm.policy_bydst[dir].table = ndst; net->xfrm.policy_bydst[dir].hmask = nhashmask; @@ -2830,6 +2867,10 @@ static int __net_init xfrm_policy_init(struct net *net) if (!htab->table) goto out_bydst; htab->hmask = hmask; + htab->dbits4 = 32; + htab->sbits4 = 32; + htab->dbits6 = 128; + htab->sbits6 = 128; } INIT_LIST_HEAD(&net->xfrm.policy_all); -- cgit v1.2.3 From 880a6fab8f6ba5b5abe59ea68533202ddea1012c Mon Sep 17 00:00:00 2001 From: Christophe Gouault Date: Fri, 29 Aug 2014 16:16:05 +0200 Subject: xfrm: configure policy hash table thresholds by netlink Enable to specify local and remote prefix length thresholds for the policy hash table via a netlink XFRM_MSG_NEWSPDINFO message. prefix length thresholds are specified by XFRMA_SPD_IPV4_HTHRESH and XFRMA_SPD_IPV6_HTHRESH optional attributes (struct xfrmu_spdhthresh). example: struct xfrmu_spdhthresh thresh4 = { .lbits = 0; .rbits = 24; }; struct xfrmu_spdhthresh thresh6 = { .lbits = 0; .rbits = 56; }; struct nlmsghdr *hdr; struct nl_msg *msg; msg = nlmsg_alloc(); hdr = nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, XFRMA_SPD_IPV4_HTHRESH, sizeof(__u32), NLM_F_REQUEST); nla_put(msg, XFRMA_SPD_IPV4_HTHRESH, sizeof(thresh4), &thresh4); nla_put(msg, XFRMA_SPD_IPV6_HTHRESH, sizeof(thresh6), &thresh6); nla_send_auto(sk, msg); The numbers are the policy selector minimum prefix lengths to put a policy in the hash table. - lbits is the local threshold (source address for out policies, destination address for in and fwd policies). - rbits is the remote threshold (destination address for out policies, source address for in and fwd policies). The default values are: XFRMA_SPD_IPV4_HTHRESH: 32 32 XFRMA_SPD_IPV6_HTHRESH: 128 128 Dynamic re-building of the SPD is performed when the thresholds values are changed. The current thresholds can be read via a XFRM_MSG_GETSPDINFO request: the kernel replies to XFRM_MSG_GETSPDINFO requests by an XFRM_MSG_NEWSPDINFO message, with both attributes XFRMA_SPD_IPV4_HTHRESH and XFRMA_SPD_IPV6_HTHRESH. Signed-off-by: Christophe Gouault Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 10 ++++++ include/net/xfrm.h | 1 + include/uapi/linux/xfrm.h | 7 ++++ net/xfrm/xfrm_policy.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_user.c | 80 +++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 182 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 41902a8103bd..9da798256f0e 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -19,6 +19,15 @@ struct xfrm_policy_hash { u8 sbits6; }; +struct xfrm_policy_hthresh { + struct work_struct work; + seqlock_t lock; + u8 lbits4; + u8 rbits4; + u8 lbits6; + u8 rbits6; +}; + struct netns_xfrm { struct list_head state_all; /* @@ -45,6 +54,7 @@ struct netns_xfrm { struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX * 2]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; struct work_struct policy_hash_work; + struct xfrm_policy_hthresh policy_hthresh; struct sock *nlsk; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 721e9c3b11bd..dc4865e90fe4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1591,6 +1591,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir, u32 id, int delete, int *err); int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); +void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); int verify_spi_info(u8 proto, u32 min, u32 max); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 25e5dd916ba4..02d5125a5ee8 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -328,6 +328,8 @@ enum xfrm_spdattr_type_t { XFRMA_SPD_UNSPEC, XFRMA_SPD_INFO, XFRMA_SPD_HINFO, + XFRMA_SPD_IPV4_HTHRESH, + XFRMA_SPD_IPV6_HTHRESH, __XFRMA_SPD_MAX #define XFRMA_SPD_MAX (__XFRMA_SPD_MAX - 1) @@ -347,6 +349,11 @@ struct xfrmu_spdhinfo { __u32 spdhmcnt; }; +struct xfrmu_spdhthresh { + __u8 lbits; + __u8 rbits; +}; + struct xfrm_usersa_info { struct xfrm_selector sel; struct xfrm_id id; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e6ff7b4046ea..55bcb8604bc6 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -566,6 +566,86 @@ static void xfrm_hash_resize(struct work_struct *work) mutex_unlock(&hash_resize_mutex); } +static void xfrm_hash_rebuild(struct work_struct *work) +{ + struct net *net = container_of(work, struct net, + xfrm.policy_hthresh.work); + unsigned int hmask; + struct xfrm_policy *pol; + struct xfrm_policy *policy; + struct hlist_head *chain; + struct hlist_head *odst; + struct hlist_node *newpos; + int i; + int dir; + unsigned seq; + u8 lbits4, rbits4, lbits6, rbits6; + + mutex_lock(&hash_resize_mutex); + + /* read selector prefixlen thresholds */ + do { + seq = read_seqbegin(&net->xfrm.policy_hthresh.lock); + + lbits4 = net->xfrm.policy_hthresh.lbits4; + rbits4 = net->xfrm.policy_hthresh.rbits4; + lbits6 = net->xfrm.policy_hthresh.lbits6; + rbits6 = net->xfrm.policy_hthresh.rbits6; + } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); + + write_lock_bh(&net->xfrm.xfrm_policy_lock); + + /* reset the bydst and inexact table in all directions */ + for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { + INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); + hmask = net->xfrm.policy_bydst[dir].hmask; + odst = net->xfrm.policy_bydst[dir].table; + for (i = hmask; i >= 0; i--) + INIT_HLIST_HEAD(odst + i); + if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { + /* dir out => dst = remote, src = local */ + net->xfrm.policy_bydst[dir].dbits4 = rbits4; + net->xfrm.policy_bydst[dir].sbits4 = lbits4; + net->xfrm.policy_bydst[dir].dbits6 = rbits6; + net->xfrm.policy_bydst[dir].sbits6 = lbits6; + } else { + /* dir in/fwd => dst = local, src = remote */ + net->xfrm.policy_bydst[dir].dbits4 = lbits4; + net->xfrm.policy_bydst[dir].sbits4 = rbits4; + net->xfrm.policy_bydst[dir].dbits6 = lbits6; + net->xfrm.policy_bydst[dir].sbits6 = rbits6; + } + } + + /* re-insert all policies by order of creation */ + list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + newpos = NULL; + chain = policy_hash_bysel(net, &policy->selector, + policy->family, + xfrm_policy_id2dir(policy->index)); + hlist_for_each_entry(pol, chain, bydst) { + if (policy->priority >= pol->priority) + newpos = &pol->bydst; + else + break; + } + if (newpos) + hlist_add_behind(&policy->bydst, newpos); + else + hlist_add_head(&policy->bydst, chain); + } + + write_unlock_bh(&net->xfrm.xfrm_policy_lock); + + mutex_unlock(&hash_resize_mutex); +} + +void xfrm_policy_hash_rebuild(struct net *net) +{ + schedule_work(&net->xfrm.policy_hthresh.work); +} +EXPORT_SYMBOL(xfrm_policy_hash_rebuild); + /* Generate new index... KAME seems to generate them ordered by cost * of an absolute inpredictability of ordering of rules. This will not pass. */ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) @@ -2872,9 +2952,16 @@ static int __net_init xfrm_policy_init(struct net *net) htab->dbits6 = 128; htab->sbits6 = 128; } + net->xfrm.policy_hthresh.lbits4 = 32; + net->xfrm.policy_hthresh.rbits4 = 32; + net->xfrm.policy_hthresh.lbits6 = 128; + net->xfrm.policy_hthresh.rbits6 = 128; + + seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); + INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); if (net_eq(net, &init_net)) register_netdevice_notifier(&xfrm_dev_notifier); return 0; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d4db6ebb089d..eaf8a8f1cbe8 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -964,7 +964,9 @@ static inline size_t xfrm_spdinfo_msgsize(void) { return NLMSG_ALIGN(4) + nla_total_size(sizeof(struct xfrmu_spdinfo)) - + nla_total_size(sizeof(struct xfrmu_spdhinfo)); + + nla_total_size(sizeof(struct xfrmu_spdhinfo)) + + nla_total_size(sizeof(struct xfrmu_spdhthresh)) + + nla_total_size(sizeof(struct xfrmu_spdhthresh)); } static int build_spdinfo(struct sk_buff *skb, struct net *net, @@ -973,9 +975,11 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, struct xfrmk_spdinfo si; struct xfrmu_spdinfo spc; struct xfrmu_spdhinfo sph; + struct xfrmu_spdhthresh spt4, spt6; struct nlmsghdr *nlh; int err; u32 *f; + unsigned lseq; nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0); if (nlh == NULL) /* shouldn't really happen ... */ @@ -993,9 +997,22 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, sph.spdhcnt = si.spdhcnt; sph.spdhmcnt = si.spdhmcnt; + do { + lseq = read_seqbegin(&net->xfrm.policy_hthresh.lock); + + spt4.lbits = net->xfrm.policy_hthresh.lbits4; + spt4.rbits = net->xfrm.policy_hthresh.rbits4; + spt6.lbits = net->xfrm.policy_hthresh.lbits6; + spt6.rbits = net->xfrm.policy_hthresh.rbits6; + } while (read_seqretry(&net->xfrm.policy_hthresh.lock, lseq)); + err = nla_put(skb, XFRMA_SPD_INFO, sizeof(spc), &spc); if (!err) err = nla_put(skb, XFRMA_SPD_HINFO, sizeof(sph), &sph); + if (!err) + err = nla_put(skb, XFRMA_SPD_IPV4_HTHRESH, sizeof(spt4), &spt4); + if (!err) + err = nla_put(skb, XFRMA_SPD_IPV6_HTHRESH, sizeof(spt6), &spt6); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -1004,6 +1021,51 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, return nlmsg_end(skb, nlh); } +static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrmu_spdhthresh *thresh4 = NULL; + struct xfrmu_spdhthresh *thresh6 = NULL; + + /* selector prefixlen thresholds to hash policies */ + if (attrs[XFRMA_SPD_IPV4_HTHRESH]) { + struct nlattr *rta = attrs[XFRMA_SPD_IPV4_HTHRESH]; + + if (nla_len(rta) < sizeof(*thresh4)) + return -EINVAL; + thresh4 = nla_data(rta); + if (thresh4->lbits > 32 || thresh4->rbits > 32) + return -EINVAL; + } + if (attrs[XFRMA_SPD_IPV6_HTHRESH]) { + struct nlattr *rta = attrs[XFRMA_SPD_IPV6_HTHRESH]; + + if (nla_len(rta) < sizeof(*thresh6)) + return -EINVAL; + thresh6 = nla_data(rta); + if (thresh6->lbits > 128 || thresh6->rbits > 128) + return -EINVAL; + } + + if (thresh4 || thresh6) { + write_seqlock(&net->xfrm.policy_hthresh.lock); + if (thresh4) { + net->xfrm.policy_hthresh.lbits4 = thresh4->lbits; + net->xfrm.policy_hthresh.rbits4 = thresh4->rbits; + } + if (thresh6) { + net->xfrm.policy_hthresh.lbits6 = thresh6->lbits; + net->xfrm.policy_hthresh.rbits6 = thresh6->rbits; + } + write_sequnlock(&net->xfrm.policy_hthresh.lock); + + xfrm_policy_hash_rebuild(net); + } + + return 0; +} + static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { @@ -2274,6 +2336,7 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report), [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), }; @@ -2308,10 +2371,17 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, }; +static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { + [XFRMA_SPD_IPV4_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) }, + [XFRMA_SPD_IPV6_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) }, +}; + static const struct xfrm_link { int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **); int (*dump)(struct sk_buff *, struct netlink_callback *); int (*done)(struct netlink_callback *); + const struct nla_policy *nla_pol; + int nla_max; } xfrm_dispatch[XFRM_NR_MSGTYPES] = { [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa }, [XFRM_MSG_DELSA - XFRM_MSG_BASE] = { .doit = xfrm_del_sa }, @@ -2335,6 +2405,9 @@ static const struct xfrm_link { [XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae }, [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate }, [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo }, + [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_set_spdinfo, + .nla_pol = xfrma_spd_policy, + .nla_max = XFRMA_SPD_MAX }, [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, }; @@ -2371,8 +2444,9 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } } - err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, XFRMA_MAX, - xfrma_policy); + err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, + link->nla_max ? : XFRMA_MAX, + link->nla_pol ? : xfrma_policy); if (err < 0) return err; -- cgit v1.2.3 From 30766f4c2d60dd2a3fc67b7114174c417f43f4c6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 5 Aug 2014 20:02:42 +0200 Subject: netfilter: nat: move specific NAT IPv4 to core Move the specific NAT IPv4 core functions that are called from the hooks from iptable_nat.c to nf_nat_l3proto_ipv4.c. This prepares the ground to allow iptables and nft to use the same NAT engine code that comes in a follow up patch. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_l3proto.h | 38 +++++ net/ipv4/netfilter/iptable_nat.c | 233 +++++-------------------------- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 199 ++++++++++++++++++++++++++ 3 files changed, 271 insertions(+), 199 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h index 5a2919b2e09a..bc2d51574489 100644 --- a/include/net/netfilter/nf_nat_l3proto.h +++ b/include/net/netfilter/nf_nat_l3proto.h @@ -42,6 +42,44 @@ const struct nf_nat_l3proto *__nf_nat_l3proto_find(u8 l3proto); int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum); + +unsigned int nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index f1787c04a4dd..6b67d7e9a75d 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -28,222 +28,57 @@ static const struct xt_table nf_nat_ipv4_table = { .af = NFPROTO_IPV4, }; -static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) -{ - /* Force range to this IP; let proto decide mapping for - * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). - */ - struct nf_nat_range range; - - range.flags = 0; - pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, - HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); - - return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); -} - -static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, - const struct net_device *in, - const struct net_device *out, - struct nf_conn *ct) +static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - unsigned int ret; - ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); - if (ret == NF_ACCEPT) { - if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) - ret = alloc_null_binding(ct, hooknum); - } - return ret; + return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.nat_table); } -static unsigned int -nf_nat_ipv4_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - struct nf_conn_nat *nat; - /* maniptype == SRC for postrouting. */ - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); - - /* We never see fragments: conntrack defrags on pre-routing - * and local-out, and nf_nat_out protects post-routing. - */ - NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); - - ct = nf_ct_get(skb, &ctinfo); - /* Can't track? It's not due to stress, or conntrack would - * have dropped it. Hence it's the user's responsibilty to - * packet filter it out, or implement conntrack/NAT for that - * protocol. 8) --RR - */ - if (!ct) - return NF_ACCEPT; - - /* Don't try to NAT if this packet is not conntracked */ - if (nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - ops->hooknum)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - unsigned int ret; - - ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); - if (ret != NF_ACCEPT) - return ret; - } else { - pr_debug("Already setup manip %s for ct %p\n", - maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", - ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - break; - - default: - /* ESTABLISHED */ - NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); - -oif_changed: - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_DROP; + return nf_nat_ipv4_fn(ops, skb, in, out, iptable_nat_do_chain); } -static unsigned int -nf_nat_ipv4_in(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - unsigned int ret; - __be32 daddr = ip_hdr(skb)->daddr; - - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - daddr != ip_hdr(skb)->daddr) - skb_dst_drop(skb); - - return ret; + return nf_nat_ipv4_in(ops, skb, in, out, iptable_nat_do_chain); } -static unsigned int -nf_nat_ipv4_out(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { -#ifdef CONFIG_XFRM - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - int err; -#endif - unsigned int ret; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if ((ct->tuplehash[dir].tuple.src.u3.ip != - ct->tuplehash[!dir].tuple.dst.u3.ip) || - (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && - ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET); - if (err < 0) - ret = NF_DROP_ERR(err); - } - } -#endif - return ret; + return nf_nat_ipv4_out(ops, skb, in, out, iptable_nat_do_chain); } -static unsigned int -nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned int ret; - int err; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.dst.u3.ip != - ct->tuplehash[!dir].tuple.src.u3.ip) { - err = ip_route_me_harder(skb, RTN_UNSPEC); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#ifdef CONFIG_XFRM - else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && - ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#endif - } - return ret; + return nf_nat_ipv4_local_fn(ops, skb, in, out, iptable_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv4_in, + .hook = iptable_nat_ipv4_in, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, @@ -251,7 +86,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv4_out, + .hook = iptable_nat_ipv4_out, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, @@ -259,7 +94,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv4_local_fn, + .hook = iptable_nat_ipv4_local_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, @@ -267,7 +102,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv4_fn, + .hook = iptable_nat_ipv4_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 14f5ccd06337..fc37711e11f3 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -254,6 +254,205 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); +unsigned int +nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + /* maniptype == SRC for postrouting. */ + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + + /* We never see fragments: conntrack defrags on pre-routing + * and local-out, and nf_nat_out protects post-routing. + */ + NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); + + ct = nf_ct_get(skb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + * have dropped it. Hence it's the user's responsibilty to + * packet filter it out, or implement conntrack/NAT for that + * protocol. 8) --RR + */ + if (!ct) + return NF_ACCEPT; + + /* Don't try to NAT if this packet is not conntracked */ + if (nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + ops->hooknum)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + unsigned int ret; + + ret = do_chain(ops, skb, in, out, ct); + if (ret != NF_ACCEPT) + return ret; + + if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + break; + + ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + if (ret != NF_ACCEPT) + return ret; + } else { + pr_debug("Already setup manip %s for ct %p\n", + maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", + ct); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + break; + + default: + /* ESTABLISHED */ + NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || + ctinfo == IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + +oif_changed: + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); + +unsigned int +nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + unsigned int ret; + __be32 daddr = ip_hdr(skb)->daddr; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + daddr != ip_hdr(skb)->daddr) + skb_dst_drop(skb); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_in); + +unsigned int +nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ +#ifdef CONFIG_XFRM + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int err; +#endif + unsigned int ret; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if ((ct->tuplehash[dir].tuple.src.u3.ip != + ct->tuplehash[!dir].tuple.dst.u3.ip) || + (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && + ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all)) { + err = nf_xfrm_me_harder(skb, AF_INET); + if (err < 0) + ret = NF_DROP_ERR(err); + } + } +#endif + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_out); + +unsigned int +nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + int err; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.dst.u3.ip != + ct->tuplehash[!dir].tuple.src.u3.ip) { + err = ip_route_me_harder(skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#ifdef CONFIG_XFRM + else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && + ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) { + err = nf_xfrm_me_harder(skb, AF_INET); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#endif + } + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn); + static int __init nf_nat_l3proto_ipv4_init(void) { int err; -- cgit v1.2.3 From 940001762ac514810e305aab356983829e5fa82a Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 3 Sep 2014 09:22:36 +0800 Subject: lib/rhashtable: allow user to set the minimum shifts of shrinking Although rhashtable library allows user to specify a quiet big size for user's created hash table, the table may be shrunk to a very small size - HASH_MIN_SIZE(4) after object is removed from the table at the first time. Subsequently, even if the total amount of objects saved in the table is quite lower than user's initial setting in a long time, the hash table size is still dynamically adjusted by rhashtable_shrink() or rhashtable_expand() each time object is inserted or removed from the table. However, as synchronize_rcu() has to be called when table is shrunk or expanded by the two functions, we should permit user to set the minimum table size through configuring the minimum number of shifts according to user specific requirement, avoiding these expensive actions of shrinking or expanding because of calling synchronize_rcu(). Signed-off-by: Ying Xue Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 ++ lib/rhashtable.c | 12 ++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 36826c0166c5..fb298e9d6d3a 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -44,6 +44,7 @@ struct rhashtable; * @head_offset: Offset of rhash_head in struct to be hashed * @hash_rnd: Seed to use while hashing * @max_shift: Maximum number of shifts while expanding + * @min_shift: Minimum number of shifts while shrinking * @hashfn: Function to hash key * @obj_hashfn: Function to hash object * @grow_decision: If defined, may return true if table should expand @@ -57,6 +58,7 @@ struct rhashtable_params { size_t head_offset; u32 hash_rnd; size_t max_shift; + size_t min_shift; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; bool (*grow_decision)(const struct rhashtable *ht, diff --git a/lib/rhashtable.c b/lib/rhashtable.c index a2c78810ebc1..8dfec3f26d4c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -298,7 +298,7 @@ int rhashtable_shrink(struct rhashtable *ht, gfp_t flags) ASSERT_RHT_MUTEX(ht); - if (tbl->size <= HASH_MIN_SIZE) + if (ht->shift <= ht->p.min_shift) return 0; ntbl = bucket_table_alloc(tbl->size / 2, flags); @@ -506,9 +506,10 @@ void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash, } EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); -static size_t rounded_hashtable_size(unsigned int nelem) +static size_t rounded_hashtable_size(struct rhashtable_params *params) { - return max(roundup_pow_of_two(nelem * 4 / 3), HASH_MIN_SIZE); + return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + 1UL << params->min_shift); } /** @@ -566,8 +567,11 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) (!params->key_len && !params->obj_hashfn)) return -EINVAL; + params->min_shift = max_t(size_t, params->min_shift, + ilog2(HASH_MIN_SIZE)); + if (params->nelem_hint) - size = rounded_hashtable_size(params->nelem_hint); + size = rounded_hashtable_size(params); tbl = bucket_table_alloc(size, GFP_KERNEL); if (tbl == NULL) -- cgit v1.2.3 From 87fed556d08d21dd7dd3e0222c94c187e4c2d5e2 Mon Sep 17 00:00:00 2001 From: RafaÅ‚ MiÅ‚ecki Date: Wed, 3 Sep 2014 10:35:13 +0200 Subject: bcma: get info about flash type SoC booted from MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is an ongoing work on cleaning MIPS's nvram support so it could be re-used on other platforms (bcm53xx to say precisely). This will require a bit of extra logic in bcma this patch implements. Signed-off-by: RafaÅ‚ MiÅ‚ecki Signed-off-by: John W. Linville --- drivers/bcma/driver_mips.c | 62 ++++++++++++++++++++++++++++++++++++++++++ include/linux/bcma/bcma_regs.h | 5 ++++ 2 files changed, 67 insertions(+) (limited to 'include') diff --git a/drivers/bcma/driver_mips.c b/drivers/bcma/driver_mips.c index 11115bbe115c..004d6aa671ce 100644 --- a/drivers/bcma/driver_mips.c +++ b/drivers/bcma/driver_mips.c @@ -21,6 +21,14 @@ #include #include +enum bcma_boot_dev { + BCMA_BOOT_DEV_UNK = 0, + BCMA_BOOT_DEV_ROM, + BCMA_BOOT_DEV_PARALLEL, + BCMA_BOOT_DEV_SERIAL, + BCMA_BOOT_DEV_NAND, +}; + static const char * const part_probes[] = { "bcm47xxpart", NULL }; static struct physmap_flash_data bcma_pflash_data = { @@ -229,11 +237,51 @@ u32 bcma_cpu_clock(struct bcma_drv_mips *mcore) } EXPORT_SYMBOL(bcma_cpu_clock); +static enum bcma_boot_dev bcma_boot_dev(struct bcma_bus *bus) +{ + struct bcma_drv_cc *cc = &bus->drv_cc; + u8 cc_rev = cc->core->id.rev; + + if (cc_rev == 42) { + struct bcma_device *core; + + core = bcma_find_core(bus, BCMA_CORE_NS_ROM); + if (core) { + switch (bcma_aread32(core, BCMA_IOST) & + BCMA_NS_ROM_IOST_BOOT_DEV_MASK) { + case BCMA_NS_ROM_IOST_BOOT_DEV_NOR: + return BCMA_BOOT_DEV_SERIAL; + case BCMA_NS_ROM_IOST_BOOT_DEV_NAND: + return BCMA_BOOT_DEV_NAND; + case BCMA_NS_ROM_IOST_BOOT_DEV_ROM: + default: + return BCMA_BOOT_DEV_ROM; + } + } + } else { + if (cc_rev == 38) { + if (cc->status & BCMA_CC_CHIPST_5357_NAND_BOOT) + return BCMA_BOOT_DEV_NAND; + else if (cc->status & BIT(5)) + return BCMA_BOOT_DEV_ROM; + } + + if ((cc->capabilities & BCMA_CC_CAP_FLASHT) == + BCMA_CC_FLASHT_PARA) + return BCMA_BOOT_DEV_PARALLEL; + else + return BCMA_BOOT_DEV_SERIAL; + } + + return BCMA_BOOT_DEV_SERIAL; +} + static void bcma_core_mips_flash_detect(struct bcma_drv_mips *mcore) { struct bcma_bus *bus = mcore->core->bus; struct bcma_drv_cc *cc = &bus->drv_cc; struct bcma_pflash *pflash = &cc->pflash; + enum bcma_boot_dev boot_dev; switch (cc->capabilities & BCMA_CC_CAP_FLASHT) { case BCMA_CC_FLASHT_STSER: @@ -269,6 +317,20 @@ static void bcma_core_mips_flash_detect(struct bcma_drv_mips *mcore) bcma_nflash_init(cc); } } + + /* Determine flash type this SoC boots from */ + boot_dev = bcma_boot_dev(bus); + switch (boot_dev) { + case BCMA_BOOT_DEV_PARALLEL: + case BCMA_BOOT_DEV_SERIAL: + /* TODO: Init NVRAM using BCMA_SOC_FLASH2 window */ + break; + case BCMA_BOOT_DEV_NAND: + /* TODO: Init NVRAM using BCMA_SOC_FLASH1 window */ + break; + default: + break; + } } void bcma_core_mips_early_init(struct bcma_drv_mips *mcore) diff --git a/include/linux/bcma/bcma_regs.h b/include/linux/bcma/bcma_regs.h index 917dcd7965e7..e64ae7bf80a1 100644 --- a/include/linux/bcma/bcma_regs.h +++ b/include/linux/bcma/bcma_regs.h @@ -39,6 +39,11 @@ #define BCMA_RESET_CTL_RESET 0x0001 #define BCMA_RESET_ST 0x0804 +#define BCMA_NS_ROM_IOST_BOOT_DEV_MASK 0x0003 +#define BCMA_NS_ROM_IOST_BOOT_DEV_NOR 0x0000 +#define BCMA_NS_ROM_IOST_BOOT_DEV_NAND 0x0001 +#define BCMA_NS_ROM_IOST_BOOT_DEV_ROM 0x0002 + /* BCMA PCI config space registers. */ #define BCMA_PCI_PMCSR 0x44 #define BCMA_PCI_PE 0x100 -- cgit v1.2.3 From 2f711939d2ea9dfaecebecd1324d2ec7a7a21f65 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 2 Sep 2014 15:49:25 +0200 Subject: ipv6: add sysctl_mld_qrv to configure query robustness variable This patch adds a new sysctl_mld_qrv knob to configure the mldv1/v2 query robustness variable. It specifies how many retransmit of unsolicited mld retransmit should happen. Admins might want to tune this on lossy links. Also reset mld state on interface down/up, so we pick up new sysctl settings during interface up event. IPv6 certification requests this knob to be available. I didn't make this knob netns specific, as it is mostly a setting in a physical environment and should be per host. Cc: Flavio Leitner Signed-off-by: Hannes Frederic Sowa Acked-by: Flavio Leitner Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ include/net/ipv6.h | 1 + net/ipv6/mcast.c | 25 +++++++++++++++---------- net/ipv6/sysctl_net_ipv6.c | 10 ++++++++++ 4 files changed, 31 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 3cce8ea6b139..cfc71ac0f764 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1152,6 +1152,11 @@ anycast_src_echo_reply - BOOLEAN FALSE: disabled Default: FALSE +mld_qrv - INTEGER + Controls the MLD query robustness variable (see RFC3810 9.1). + Default: 2 (as specified by RFC3810 9.1) + Minimum: 1 (as specified by RFC6636 4.5) + IPv6 Fragmentation: ip6frag_high_thresh - INTEGER diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a2db816e8461..7e247e9b8765 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -121,6 +121,7 @@ struct frag_hdr { /* sysctls */ extern int sysctl_mld_max_msf; +extern int sysctl_mld_qrv; #define _DEVINC(net, statname, modifier, idev, field) \ ({ \ diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 70881795da96..64919425f1ab 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -121,6 +121,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, #define IPV6_MLD_MAX_MSF 64 int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; +int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; /* * socket join on multicast group @@ -1191,15 +1192,16 @@ static void mld_update_qrv(struct inet6_dev *idev, * and SHOULD NOT be one. Catch this here if we ever run * into such a case in future. */ + const int min_qrv = min(MLD_QRV_DEFAULT, sysctl_mld_qrv); WARN_ON(idev->mc_qrv == 0); if (mlh2->mld2q_qrv > 0) idev->mc_qrv = mlh2->mld2q_qrv; - if (unlikely(idev->mc_qrv < 2)) { + if (unlikely(idev->mc_qrv < min_qrv)) { net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n", - idev->mc_qrv, MLD_QRV_DEFAULT); - idev->mc_qrv = MLD_QRV_DEFAULT; + idev->mc_qrv, min_qrv); + idev->mc_qrv = min_qrv; } } @@ -2478,6 +2480,14 @@ void ipv6_mc_down(struct inet6_dev *idev) mld_clear_delrec(idev); } +static void ipv6_mc_reset(struct inet6_dev *idev) +{ + idev->mc_qrv = sysctl_mld_qrv; + idev->mc_qi = MLD_QI_DEFAULT; + idev->mc_qri = MLD_QRI_DEFAULT; + idev->mc_v1_seen = 0; + idev->mc_maxdelay = unsolicited_report_interval(idev); +} /* Device going up */ @@ -2488,6 +2498,7 @@ void ipv6_mc_up(struct inet6_dev *idev) /* Install multicast list, except for all-nodes (already installed) */ read_lock_bh(&idev->lock); + ipv6_mc_reset(idev); for (i = idev->mc_list; i; i = i->next) igmp6_group_added(i); read_unlock_bh(&idev->lock); @@ -2508,13 +2519,7 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) (unsigned long)idev); setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire, (unsigned long)idev); - - idev->mc_qrv = MLD_QRV_DEFAULT; - idev->mc_qi = MLD_QI_DEFAULT; - idev->mc_qri = MLD_QRI_DEFAULT; - - idev->mc_maxdelay = unsolicited_report_interval(idev); - idev->mc_v1_seen = 0; + ipv6_mc_reset(idev); write_unlock_bh(&idev->lock); } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 0c56c93619e0..c5c10fafcfe2 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -16,6 +16,8 @@ #include #include +static int one = 1; + static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", @@ -63,6 +65,14 @@ static struct ctl_table ipv6_rotable[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "mld_qrv", + .data = &sysctl_mld_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, { } }; -- cgit v1.2.3 From a9fe8e29945d56f35235a3a0fba99b4cf181d211 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 2 Sep 2014 15:49:26 +0200 Subject: ipv4: implement igmp_qrv sysctl to tune igmp robustness variable As in IPv6 people might increase the igmp query robustness variable to make sure unsolicited state change reports aren't lost on the network. Add and document this new knob to igmp code. RFCs allow tuning this parameter back to first IGMP RFC, so we also use this setting for all counters, including source specific multicast. Also take over sysctl value when upping the interface and don't reuse the last one seen on the interface. Cc: Flavio Leitner Signed-off-by: Hannes Frederic Sowa Acked-by: Flavio Leitner Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ include/linux/igmp.h | 1 + net/ipv4/igmp.c | 31 +++++++++++++++---------------- net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ 4 files changed, 31 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index cfc71ac0f764..db2383cb1df9 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -844,6 +844,11 @@ igmp_max_memberships - INTEGER conf/all/* is special, changes the settings for all interfaces +igmp_qrv - INTEGER + Controls the IGMP query robustness variable (see RFC2236 8.1). + Default: 2 (as specified by RFC2236 8.1) + Minimum: 1 (as specified by RFC6636 4.5) + log_martians - BOOLEAN Log packets with impossible addresses to kernel log. log_martians for the interface will be enabled if at least one of diff --git a/include/linux/igmp.h b/include/linux/igmp.h index f47550d75f85..2c677afeea47 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -39,6 +39,7 @@ static inline struct igmpv3_query * extern int sysctl_igmp_max_memberships; extern int sysctl_igmp_max_msf; +extern int sysctl_igmp_qrv; struct ip_sf_socklist { unsigned int sl_max; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 890c4258804c..4146153d875d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -117,7 +117,7 @@ #define IGMP_V2_Unsolicited_Report_Interval (10*HZ) #define IGMP_V3_Unsolicited_Report_Interval (1*HZ) #define IGMP_Query_Response_Interval (10*HZ) -#define IGMP_Unsolicited_Report_Count 2 +#define IGMP_Query_Robustness_Variable 2 #define IGMP_Initial_Report_Delay (1) @@ -756,8 +756,7 @@ static void igmp_ifc_event(struct in_device *in_dev) { if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv; igmp_ifc_start_timer(in_dev, 1); } @@ -1086,8 +1085,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; @@ -1226,8 +1224,7 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; igmp_ifc_event(in_dev); #endif } @@ -1322,7 +1319,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); - im->unsolicit_count = IGMP_Unsolicited_Report_Count; + im->unsolicit_count = sysctl_igmp_qrv; #endif im->next_rcu = in_dev->mc_list; @@ -1460,7 +1457,7 @@ void ip_mc_init_dev(struct in_device *in_dev) (unsigned long)in_dev); setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, (unsigned long)in_dev); - in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; + in_dev->mr_qrv = sysctl_igmp_qrv; #endif spin_lock_init(&in_dev->mc_tomb_lock); @@ -1474,6 +1471,9 @@ void ip_mc_up(struct in_device *in_dev) ASSERT_RTNL(); +#ifdef CONFIG_IP_MULTICAST + in_dev->mr_qrv = sysctl_igmp_qrv; +#endif ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) @@ -1540,7 +1540,9 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) */ int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; - +#ifdef CONFIG_IP_MULTICAST +int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable; +#endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) @@ -1575,8 +1577,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { - psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; @@ -1639,8 +1640,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -1818,8 +1818,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 79a007c52558..45d156dacd61 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -450,6 +450,16 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_IP_MULTICAST + { + .procname = "igmp_qrv", + .data = &sysctl_igmp_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, +#endif { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, -- cgit v1.2.3 From d98ad83ee86e523cc00cbf425f456fbd14b4fdc4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Sep 2014 15:24:57 +0300 Subject: mac80211: add Intel Mobile Communications copyright Our legal structure changed at some point (see wikipedia), but we forgot to immediately switch over to the new copyright notice. For files that we have modified in the time since the change, add the proper copyright notice now. Signed-off-by: Johannes Berg Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 1 + net/mac80211/cfg.c | 1 + net/mac80211/debugfs.c | 1 + net/mac80211/debugfs_sta.c | 1 + net/mac80211/ibss.c | 1 + net/mac80211/ieee80211_i.h | 1 + net/mac80211/iface.c | 1 + net/mac80211/key.c | 1 + net/mac80211/main.c | 1 + net/mac80211/mlme.c | 1 + net/mac80211/rx.c | 1 + net/mac80211/scan.c | 1 + net/mac80211/sta_info.c | 1 + net/mac80211/sta_info.h | 1 + net/mac80211/status.c | 1 + net/mac80211/tdls.c | 1 + net/mac80211/tx.c | 1 + net/mac80211/util.c | 1 + net/mac80211/wme.c | 1 + 19 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c9b2bec8db47..db4975c33861 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4,6 +4,7 @@ * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 4d8989b87960..d7f437ef6e7e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2,6 +2,7 @@ * mac80211 configuration hooks for cfg80211 * * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This file is GPLv2 as found in COPYING. */ diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 0e963bc1ceac..574656174f91 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -3,6 +3,7 @@ * mac80211 debugfs for wireless PHYs * * Copyright 2007 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * GPLv2 * diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 4a20fb8f1e23..b5e0a5344e8c 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -2,6 +2,7 @@ * Copyright 2003-2005 Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc * Copyright 2007 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 5f9654d31a8d..56b53571c807 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -6,6 +6,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2009, Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 78f95387e43f..b9d9508a6286 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -3,6 +3,7 @@ * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index bb7288e3c41c..af237223a8cd 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -5,6 +5,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc * Copyright 2008, Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 6429d0e1d4a1..f320a04380b9 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007-2008 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/main.c b/net/mac80211/main.c index e0ab4320a078..0de7c93bf62b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -2,6 +2,7 @@ * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 29fe91d6a094..305e5105aeb1 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5,6 +5,7 @@ * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 41eb12c87240..b04ca4049c95 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index a9bb6eb8c3e0..af0d094b2f2f 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -6,6 +6,7 @@ * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index e1f957d5935e..215752e8c5b6 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1,6 +1,7 @@ /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 16dc1d414f69..9dc7a9d6b0d2 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -1,5 +1,6 @@ /* * Copyright 2002-2005, Devicescape Software, Inc. + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/status.c b/net/mac80211/status.c index aa06dcad336e..2800e1c65519 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2008-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index f2cb3b6c1871..bcf51e43ccc1 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -3,6 +3,7 @@ * * Copyright 2006-2010 Johannes Berg * Copyright 2014, Intel Corporation + * Copyright 2014 Intel Mobile Communications GmbH * * This file is GPLv2 as found in COPYING. */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 925c39f4099e..d1b3c223d6f9 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 725af7a468d2..b444f27a788e 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 6459946f0b74..3b873989992c 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -1,5 +1,6 @@ /* * Copyright 2004, Instant802 Networks, Inc. + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as -- cgit v1.2.3 From 2740f0cf8ec8bc7ee6a58f68841759e367dda98f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Sep 2014 15:24:58 +0300 Subject: cfg80211: add Intel Mobile Communications copyright Our legal structure changed at some point (see wikipedia), but we forgot to immediately switch over to the new copyright notice. For files that we have modified in the time since the change, add the proper copyright notice now. Signed-off-by: Johannes Berg Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 1 + net/wireless/chan.c | 1 + net/wireless/core.c | 1 + net/wireless/nl80211.c | 1 + net/wireless/reg.c | 1 + net/wireless/scan.c | 1 + net/wireless/util.c | 1 + 8 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 8018c915ee63..77d4f26e0235 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -6,6 +6,7 @@ * Copyright (c) 2002-2003, Jouni Malinen * Copyright (c) 2005, Devicescape Software, Inc. * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ab21299c8f4d..6be68bb31918 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4,6 +4,7 @@ * 802.11 device and configuration interface * * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 992b34070bcb..72d81e2154d5 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -4,6 +4,7 @@ * any point in time. * * Copyright 2009 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include diff --git a/net/wireless/core.c b/net/wireless/core.c index a207eb116829..9698fe709251 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -2,6 +2,7 @@ * This is the linux wireless configuration interface. * * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3011401f52c0..d876146498dd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2,6 +2,7 @@ * This is the new netlink-based wireless configuration interface. * * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 1afdf45db38f..8bfc9b172a49 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2007 Johannes Berg * Copyright 2008-2011 Luis R. Rodriguez + * Copyright 2013-2014 Intel Mobile Communications GmbH * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 620a4b40d466..bda39f149810 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2,6 +2,7 @@ * cfg80211 scan result handling * * Copyright 2008 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include #include diff --git a/net/wireless/util.c b/net/wireless/util.c index 728f1c0dc70d..a8b2816ffcb9 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2,6 +2,7 @@ * Wireless utility functions * * Copyright 2007-2009 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include #include -- cgit v1.2.3 From 1c7e23bf50264a251de53ad9fb1604683b801258 Mon Sep 17 00:00:00 2001 From: Assaf Krauss Date: Wed, 3 Sep 2014 15:25:00 +0300 Subject: nl80211: Allow declaring RRM-related features Radio Resource Measurement (RRM) is a bundle of features which will require the entire stack to participate. In this patch, the driver is given the opportunity to advertise the device's support for these RRM-related features, using feature flags: 1. Support for Quiet IEs. 2. Support for adding DS Parameter Set IE to probe requests. 3. Support for adding WFA TPC Report IE to probe requests. 4. Support for inserting tx power value to tx-ed packets at a fixed offset. This is used in action frames, such as RRM's Link Measurement Report, where the actual tx power should be reported in the frame. Signed-off-by: Assaf Krauss Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d097568da690..c166d3d23e55 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3968,6 +3968,16 @@ enum nl80211_ap_sme_features { * @NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE: This driver supports dynamic * channel bandwidth change (e.g., HT 20 <-> 40 MHz channel) during the * lifetime of a BSS. + * @NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES: This device adds a DS Parameter + * Set IE to probe requests. + * @NL80211_FEATURE_WFA_TPC_IE_IN_PROBES: This device adds a WFA TPC Report IE + * to probe requests. + * @NL80211_FEATURE_QUIET: This device, in client mode, supports Quiet Period + * requests sent to it by an AP. + * @NL80211_FEATURE_TX_POWER_INSERTION: This device is capable of inserting the + * current tx power value into the TPC Report IE in the spectrum + * management TPC Report action frame, and in the Radio Measurement Link + * Measurement Report action frame. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -3989,6 +3999,10 @@ enum nl80211_feature_flags { NL80211_FEATURE_USERSPACE_MPM = 1 << 16, NL80211_FEATURE_ACTIVE_MONITOR = 1 << 17, NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE = 1 << 18, + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES = 1 << 19, + NL80211_FEATURE_WFA_TPC_IE_IN_PROBES = 1 << 20, + NL80211_FEATURE_QUIET = 1 << 21, + NL80211_FEATURE_TX_POWER_INSERTION = 1 << 22, }; /** -- cgit v1.2.3 From bab5ab7d2a5466406e8003d038cc7ce6b2d5d804 Mon Sep 17 00:00:00 2001 From: Assaf Krauss Date: Wed, 3 Sep 2014 15:25:01 +0300 Subject: nl80211: Add flag attribute for RRM connections Add a flag attribute to use in associations, for tagging the target connection as supporting RRM. It is the responsibility of upper layers to set this flag only if both the underlying device, and the target network indeed support RRM. To be used in ASSOCIATE and CONNECT commands. Signed-off-by: Assaf Krauss Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 13 +++++++++++++ net/wireless/nl80211.c | 17 +++++++++++++++++ 3 files changed, 32 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6be68bb31918..a887eeb5b31e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1608,10 +1608,12 @@ struct cfg80211_auth_request { * * @ASSOC_REQ_DISABLE_HT: Disable HT (802.11n) * @ASSOC_REQ_DISABLE_VHT: Disable VHT + * @ASSOC_REQ_USE_RRM: Declare RRM capability in this association */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), ASSOC_REQ_DISABLE_VHT = BIT(1), + ASSOC_REQ_USE_RRM = BIT(2), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c166d3d23e55..de69d3df5e55 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1594,6 +1594,17 @@ enum nl80211_commands { * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. * + * @NL80211_ATTR_USE_RRM: flag for indicating whether the current connection + * shall support Radio Resource Measurements (11k). This attribute can be + * used with %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests. + * User space applications are expected to use this flag only if the + * underlying device supports these minimal RRM features: + * %NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES, + * %NL80211_FEATURE_QUIET, + * If this flag is used, driver must add the Power Capabilities IE to the + * association request. In addition, it must also set the RRM capability + * flag in the association request's Capability Info field. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1936,6 +1947,8 @@ enum nl80211_attrs { NL80211_ATTR_TDLS_INITIATOR, + NL80211_ATTR_USE_RRM, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d876146498dd..459dc2769d0e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -389,6 +389,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 }, [NL80211_ATTR_IFACE_SOCKET_OWNER] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, + [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -6584,6 +6585,14 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) sizeof(req.vht_capa)); } + if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { + if (!(rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || + !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + return -EINVAL; + req.flags |= ASSOC_REQ_USE_RRM; + } + err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { wdev_lock(dev->ieee80211_ptr); @@ -7241,6 +7250,14 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) sizeof(connect.vht_capa)); } + if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { + if (!(rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || + !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + return -EINVAL; + connect.flags |= ASSOC_REQ_USE_RRM; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL); wdev_unlock(dev->ieee80211_ptr); -- cgit v1.2.3 From 3057dbfdab1b86a77ed6d512fc857b032f78663b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 4 Sep 2014 23:57:40 +0200 Subject: cfg80211: enable dynack through nl80211 Enable ACK timeout estimation algorithm (dynack) using mac80211 set_coverage_class API. Dynack is activated passing coverage class equals to -1 to lower drivers and it is automatically disabled setting valid value for coverage class. Define NL80211_ATTR_WIPHY_DYN_ACK flag attribute to enable dynack from userspace. In order to activate dynack NL80211_FEATURE_ACKTO_ESTIMATION feature flag must be set by lower drivers to indicate dynack capability. Signed-off-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 12 ++++++++++++ net/wireless/nl80211.c | 11 +++++++++++ 3 files changed, 25 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a887eeb5b31e..0d17ec9df692 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1805,6 +1805,7 @@ struct cfg80211_connect_params { * @WIPHY_PARAM_FRAG_THRESHOLD: wiphy->frag_threshold has changed * @WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed * @WIPHY_PARAM_COVERAGE_CLASS: coverage class changed + * @WIPHY_PARAM_DYN_ACK: dynack has been enabled */ enum wiphy_params_flags { WIPHY_PARAM_RETRY_SHORT = 1 << 0, @@ -1812,6 +1813,7 @@ enum wiphy_params_flags { WIPHY_PARAM_FRAG_THRESHOLD = 1 << 2, WIPHY_PARAM_RTS_THRESHOLD = 1 << 3, WIPHY_PARAM_COVERAGE_CLASS = 1 << 4, + WIPHY_PARAM_DYN_ACK = 1 << 5, }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index de69d3df5e55..29c4399e08b9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1605,6 +1605,12 @@ enum nl80211_commands { * association request. In addition, it must also set the RRM capability * flag in the association request's Capability Info field. * + * @NL80211_ATTR_WIPHY_DYN_ACK: flag attribute used to enable ACK timeout + * estimation algorithm (dynack). In order to activate dynack + * %NL80211_FEATURE_ACKTO_ESTIMATION feature flag must be set by lower + * drivers to indicate dynack capability. Dynack is automatically disabled + * setting valid value for coverage class. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1949,6 +1955,8 @@ enum nl80211_attrs { NL80211_ATTR_USE_RRM, + NL80211_ATTR_WIPHY_DYN_ACK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3991,6 +3999,9 @@ enum nl80211_ap_sme_features { * current tx power value into the TPC Report IE in the spectrum * management TPC Report action frame, and in the Radio Measurement Link * Measurement Report action frame. + * @NL80211_FEATURE_ACKTO_ESTIMATION: This driver supports dynamic ACK timeout + * estimation (dynack). %NL80211_ATTR_WIPHY_DYN_ACK flag attribute is used + * to enable dynack. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -4016,6 +4027,7 @@ enum nl80211_feature_flags { NL80211_FEATURE_WFA_TPC_IE_IN_PROBES = 1 << 20, NL80211_FEATURE_QUIET = 1 << 21, NL80211_FEATURE_TX_POWER_INSERTION = 1 << 22, + NL80211_FEATURE_ACKTO_ESTIMATION = 1 << 23, }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 459dc2769d0e..cf178d2b621d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -226,6 +226,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 }, + [NL80211_ATTR_WIPHY_DYN_ACK] = { .type = NLA_FLAG }, [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 }, [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, @@ -2239,11 +2240,21 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) { + if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) + return -EINVAL; + coverage_class = nla_get_u8( info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]); changed |= WIPHY_PARAM_COVERAGE_CLASS; } + if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) { + if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) + return -EOPNOTSUPP; + + changed |= WIPHY_PARAM_DYN_ACK; + } + if (changed) { u8 old_retry_short, old_retry_long; u32 old_frag_threshold, old_rts_threshold; -- cgit v1.2.3 From a4bcaf5556da649f0160e60fa7b4bb2c29801c12 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 4 Sep 2014 23:57:41 +0200 Subject: mac80211: extend set_coverage_class signature Extend mac80211 set_coverage_class API in order to enable ACK timeout estimation algorithm (dynack) passing coverage class equals to -1 to lower drivers. Synchronize set_coverage_class routine signature with mac80211 function pointer for p54, ath9k, ath9k_htc and ath5k drivers. Signed-off-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath5k/mac80211-ops.c | 2 +- drivers/net/wireless/ath/ath9k/htc_drv_main.c | 2 +- drivers/net/wireless/ath/ath9k/main.c | 3 ++- drivers/net/wireless/p54/main.c | 3 ++- include/net/mac80211.h | 6 ++++-- net/mac80211/cfg.c | 9 +++++++-- net/mac80211/driver-ops.h | 2 +- net/mac80211/trace.h | 4 ++-- 8 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath5k/mac80211-ops.c b/drivers/net/wireless/ath/ath5k/mac80211-ops.c index b65c38fdaa4b..ab2709a43768 100644 --- a/drivers/net/wireless/ath/ath5k/mac80211-ops.c +++ b/drivers/net/wireless/ath/ath5k/mac80211-ops.c @@ -704,7 +704,7 @@ ath5k_get_survey(struct ieee80211_hw *hw, int idx, struct survey_info *survey) * reset. */ static void -ath5k_set_coverage_class(struct ieee80211_hw *hw, u8 coverage_class) +ath5k_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) { struct ath5k_hw *ah = hw->priv; diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c index 5627917c5ff7..994fff1ff519 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c @@ -1722,7 +1722,7 @@ static int ath9k_htc_set_rts_threshold(struct ieee80211_hw *hw, u32 value) } static void ath9k_htc_set_coverage_class(struct ieee80211_hw *hw, - u8 coverage_class) + s16 coverage_class) { struct ath9k_htc_priv *priv = hw->priv; diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index e6ac8d2e610c..97c028e36647 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -1959,7 +1959,8 @@ static int ath9k_get_survey(struct ieee80211_hw *hw, int idx, return 0; } -static void ath9k_set_coverage_class(struct ieee80211_hw *hw, u8 coverage_class) +static void ath9k_set_coverage_class(struct ieee80211_hw *hw, + s16 coverage_class) { struct ath_softc *sc = hw->priv; struct ath_hw *ah = sc->sc_ah; diff --git a/drivers/net/wireless/p54/main.c b/drivers/net/wireless/p54/main.c index 7be3a4839640..97aeff0edb84 100644 --- a/drivers/net/wireless/p54/main.c +++ b/drivers/net/wireless/p54/main.c @@ -696,7 +696,8 @@ static void p54_flush(struct ieee80211_hw *dev, struct ieee80211_vif *vif, WARN(total, "tx flush timeout, unresponsive firmware"); } -static void p54_set_coverage_class(struct ieee80211_hw *dev, u8 coverage_class) +static void p54_set_coverage_class(struct ieee80211_hw *dev, + s16 coverage_class) { struct p54_common *priv = dev->priv; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index db4975c33861..c6e6a6804ee4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2673,7 +2673,9 @@ enum ieee80211_roc_type { * * @set_coverage_class: Set slot time for given coverage class as specified * in IEEE 802.11-2007 section 17.3.8.6 and modify ACK timeout - * accordingly. This callback is not required and may sleep. + * accordingly; coverage class equals to -1 to enable ACK timeout + * estimation algorithm (dynack). To disable dynack set valid value for + * coverage class. This callback is not required and may sleep. * * @testmode_cmd: Implement a cfg80211 test mode command. The passed @vif may * be %NULL. The callback can sleep. @@ -2957,7 +2959,7 @@ struct ieee80211_ops { int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); - void (*set_coverage_class)(struct ieee80211_hw *hw, u8 coverage_class); + void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class); #ifdef CONFIG_NL80211_TESTMODE int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 4c1681bae232..101ae6cfad81 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1978,8 +1978,13 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) return err; } - if (changed & WIPHY_PARAM_COVERAGE_CLASS) { - err = drv_set_coverage_class(local, wiphy->coverage_class); + if ((changed & WIPHY_PARAM_COVERAGE_CLASS) || + (changed & WIPHY_PARAM_DYN_ACK)) { + s16 coverage_class; + + coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ? + wiphy->coverage_class : -1; + err = drv_set_coverage_class(local, coverage_class); if (err) return err; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 11423958116a..196d48c68134 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -450,7 +450,7 @@ static inline int drv_set_rts_threshold(struct ieee80211_local *local, } static inline int drv_set_coverage_class(struct ieee80211_local *local, - u8 value) + s16 value) { int ret = 0; might_sleep(); diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 02ac535d1274..38fae7ebe984 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -672,13 +672,13 @@ DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold, ); TRACE_EVENT(drv_set_coverage_class, - TP_PROTO(struct ieee80211_local *local, u8 value), + TP_PROTO(struct ieee80211_local *local, s16 value), TP_ARGS(local, value), TP_STRUCT__entry( LOCAL_ENTRY - __field(u8, value) + __field(s16, value) ), TP_fast_assign( -- cgit v1.2.3 From 60a3b2253c413cf601783b070507d7dd6620c954 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Sep 2014 22:53:44 +0200 Subject: net: bpf: make eBPF interpreter images read-only With eBPF getting more extended and exposure to user space is on it's way, hardening the memory range the interpreter uses to steer its command flow seems appropriate. This patch moves the to be interpreted bytecode to read-only pages. In case we execute a corrupted BPF interpreter image for some reason e.g. caused by an attacker which got past a verifier stage, it would not only provide arbitrary read/write memory access but arbitrary function calls as well. After setting up the BPF interpreter image, its contents do not change until destruction time, thus we can setup the image on immutable made pages in order to mitigate modifications to that code. The idea is derived from commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks"). This is possible because bpf_prog is not part of sk_filter anymore. After setup bpf_prog cannot be altered during its life-time. This prevents any modifications to the entire bpf_prog structure (incl. function/JIT image pointer). Every eBPF program (including classic BPF that are migrated) have to call bpf_prog_select_runtime() to select either interpreter or a JIT image as a last setup step, and they all are being freed via bpf_prog_free(), including non-JIT. Therefore, we can easily integrate this into the eBPF life-time, plus since we directly allocate a bpf_prog, we have no performance penalty. Tested with seccomp and test_bpf testsuite in JIT/non-JIT mode and manual inspection of kernel_page_tables. Brad Spengler proposed the same idea via Twitter during development of this patch. Joint work with Hannes Frederic Sowa. Suggested-by: Brad Spengler Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/arm/net/bpf_jit_32.c | 3 +- arch/mips/net/bpf_jit.c | 3 +- arch/powerpc/net/bpf_jit_comp.c | 3 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 3 +- arch/x86/net/bpf_jit_comp.c | 18 ++++------ include/linux/filter.h | 49 ++++++++++++++++++++++--- kernel/bpf/core.c | 80 +++++++++++++++++++++++++++++++++++++++-- kernel/seccomp.c | 7 ++-- lib/test_bpf.c | 2 +- net/core/filter.c | 6 ++-- 11 files changed, 144 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index a37b989a2f91..a76623bcf722 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -930,5 +930,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 05a56619ece2..cfa83cf2447d 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1427,5 +1427,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 3afa6f4c1957..40c53ff59124 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -697,5 +697,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 61e45b7c04d7..f2833c5b218a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -887,5 +887,5 @@ void bpf_jit_free(struct bpf_prog *fp) module_free(NULL, header); free_filter: - kfree(fp); + bpf_prog_unlock_free(fp); } diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index 1f76c22a6a75..f7a736b645e8 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -812,5 +812,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b08a98c59530..39ccfbb4a723 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -972,23 +972,17 @@ out: kfree(addrs); } -static void bpf_jit_free_deferred(struct work_struct *work) +void bpf_jit_free(struct bpf_prog *fp) { - struct bpf_prog *fp = container_of(work, struct bpf_prog, work); unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; struct bpf_binary_header *header = (void *)addr; + if (!fp->jited) + goto free_filter; + set_memory_rw(addr, header->pages); module_free(NULL, header); - kfree(fp); -} -void bpf_jit_free(struct bpf_prog *fp) -{ - if (fp->jited) { - INIT_WORK(&fp->work, bpf_jit_free_deferred); - schedule_work(&fp->work); - } else { - kfree(fp); - } +free_filter: + bpf_prog_unlock_free(fp); } diff --git a/include/linux/filter.h b/include/linux/filter.h index a5227ab8ccb1..c78994593355 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -9,6 +9,11 @@ #include #include #include +#include + +struct sk_buff; +struct sock; +struct seccomp_data; /* Internally used and optimized filter representation with extended * instruction set based on top of classic BPF. @@ -320,20 +325,23 @@ struct sock_fprog_kern { struct sock_filter *filter; }; -struct sk_buff; -struct sock; -struct seccomp_data; +struct bpf_work_struct { + struct bpf_prog *prog; + struct work_struct work; +}; struct bpf_prog { + u32 pages; /* Number of allocated pages */ u32 jited:1, /* Is our filter JIT'ed? */ len:31; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ + struct bpf_work_struct *work; /* Deferred free work struct */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); + /* Instructions for interpreter */ union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; - struct work_struct work; }; }; @@ -353,6 +361,26 @@ static inline unsigned int bpf_prog_size(unsigned int proglen) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ + set_memory_ro((unsigned long)fp, fp->pages); +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ + set_memory_rw((unsigned long)fp, fp->pages); +} +#else +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ +} +#endif /* CONFIG_DEBUG_SET_MODULE_RONX */ + int sk_filter(struct sock *sk, struct sk_buff *skb); void bpf_prog_select_runtime(struct bpf_prog *fp); @@ -361,6 +389,17 @@ void bpf_prog_free(struct bpf_prog *fp); int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_insn *new_prog, int *new_len); +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags); +void __bpf_prog_free(struct bpf_prog *fp); + +static inline void bpf_prog_unlock_free(struct bpf_prog *fp) +{ + bpf_prog_unlock_ro(fp); + __bpf_prog_free(fp); +} + int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); void bpf_prog_destroy(struct bpf_prog *fp); @@ -450,7 +489,7 @@ static inline void bpf_jit_compile(struct bpf_prog *fp) static inline void bpf_jit_free(struct bpf_prog *fp) { - kfree(fp); + bpf_prog_unlock_free(fp); } #endif /* CONFIG_BPF_JIT */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7f0dbcbb34af..b54bb2c2e494 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -22,6 +22,7 @@ */ #include #include +#include #include /* Registers */ @@ -63,6 +64,67 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_work_struct *ws; + struct bpf_prog *fp; + + size = round_up(size, PAGE_SIZE); + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp == NULL) + return NULL; + + ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags); + if (ws == NULL) { + vfree(fp); + return NULL; + } + + fp->pages = size / PAGE_SIZE; + fp->work = ws; + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_alloc); + +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + BUG_ON(fp_old == NULL); + + size = round_up(size, PAGE_SIZE); + if (size <= fp_old->pages * PAGE_SIZE) + return fp_old; + + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); + fp->pages = size / PAGE_SIZE; + + /* We keep fp->work from fp_old around in the new + * reallocated structure. + */ + fp_old->work = NULL; + __bpf_prog_free(fp_old); + } + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_realloc); + +void __bpf_prog_free(struct bpf_prog *fp) +{ + kfree(fp->work); + vfree(fp); +} +EXPORT_SYMBOL_GPL(__bpf_prog_free); + /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. @@ -523,12 +585,26 @@ void bpf_prog_select_runtime(struct bpf_prog *fp) /* Probe if internal BPF can be JITed */ bpf_int_jit_compile(fp); + /* Lock whole bpf_prog as read-only */ + bpf_prog_lock_ro(fp); } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); -/* free internal BPF program */ +static void bpf_prog_free_deferred(struct work_struct *work) +{ + struct bpf_work_struct *ws; + + ws = container_of(work, struct bpf_work_struct, work); + bpf_jit_free(ws->prog); +} + +/* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - bpf_jit_free(fp); + struct bpf_work_struct *ws = fp->work; + + INIT_WORK(&ws->work, bpf_prog_free_deferred); + ws->prog = fp; + schedule_work(&ws->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 44eb005c6695..84922befea84 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -395,16 +395,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) if (!filter) goto free_prog; - filter->prog = kzalloc(bpf_prog_size(new_len), - GFP_KERNEL|__GFP_NOWARN); + filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); if (!filter->prog) goto free_filter; ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); if (ret) goto free_filter_prog; - kfree(fp); + kfree(fp); atomic_set(&filter->usage, 1); filter->prog->len = new_len; @@ -413,7 +412,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return filter; free_filter_prog: - kfree(filter->prog); + __bpf_prog_free(filter->prog); free_filter: kfree(filter); free_prog: diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 8c66c6aace04..9a67456ba29a 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1836,7 +1836,7 @@ static struct bpf_prog *generate_filter(int which, int *err) break; case INTERNAL: - fp = kzalloc(bpf_prog_size(flen), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(flen), 0); if (fp == NULL) { pr_cont("UNEXPECTED_FAIL no memory left\n"); *err = -ENOMEM; diff --git a/net/core/filter.c b/net/core/filter.c index d814b8a89d0f..37f8eb06fdee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -933,7 +933,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL); + fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. @@ -1013,7 +1013,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; @@ -1069,7 +1069,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - prog = kmalloc(bpf_fsize, GFP_KERNEL); + prog = bpf_prog_alloc(bpf_fsize, 0); if (!prog) return -ENOMEM; -- cgit v1.2.3 From f0db9b073415848709dd59a6394969882f517da9 Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan <_govind@gmx.com> Date: Wed, 3 Sep 2014 03:17:20 +0530 Subject: ethtool: Add generic options for tunables This patch adds new ethtool cmd, ETHTOOL_GTUNABLE & ETHTOOL_STUNABLE for getting tunable values from driver. Add get_tunable and set_tunable to ethtool_ops. Driver implements these functions for getting/setting tunable value. Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com> Signed-off-by: David S. Miller --- include/linux/ethtool.h | 4 +++ include/uapi/linux/ethtool.h | 28 +++++++++++++++ net/core/ethtool.c | 81 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e658229fee39..c1a2d60dfb82 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -257,6 +257,10 @@ struct ethtool_ops { struct ethtool_eeprom *, u8 *); int (*get_eee)(struct net_device *, struct ethtool_eee *); int (*set_eee)(struct net_device *, struct ethtool_eee *); + int (*get_tunable)(struct net_device *, + const struct ethtool_tunable *, void *); + int (*set_tunable)(struct net_device *, + const struct ethtool_tunable *, const void *); }; diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index e3c7a719c76b..7a364f2f3d3f 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -209,6 +209,32 @@ struct ethtool_value { __u32 data; }; +enum tunable_id { + ETHTOOL_ID_UNSPEC, + ETHTOOL_RX_COPYBREAK, +}; + +enum tunable_type_id { + ETHTOOL_TUNABLE_UNSPEC, + ETHTOOL_TUNABLE_U8, + ETHTOOL_TUNABLE_U16, + ETHTOOL_TUNABLE_U32, + ETHTOOL_TUNABLE_U64, + ETHTOOL_TUNABLE_STRING, + ETHTOOL_TUNABLE_S8, + ETHTOOL_TUNABLE_S16, + ETHTOOL_TUNABLE_S32, + ETHTOOL_TUNABLE_S64, +}; + +struct ethtool_tunable { + __u32 cmd; + __u32 id; + __u32 type_id; + __u32 len; + void *data[0]; +}; + /** * struct ethtool_regs - hardware register dump * @cmd: Command number = %ETHTOOL_GREGS @@ -1152,6 +1178,8 @@ enum ethtool_sfeatures_retval_bits { #define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ #define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ +#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ +#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 17cb912793fa..27e61b886520 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1621,6 +1621,80 @@ static int ethtool_get_module_eeprom(struct net_device *dev, modinfo.eeprom_len); } +static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) +{ + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + if (tuna->len != sizeof(u32) || + tuna->type_id != ETHTOOL_TUNABLE_U32) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->get_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + ret = ops->get_tunable(dev, &tuna, data); + if (ret) + goto out; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_to_user(useraddr, data, tuna.len)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->set_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_from_user(data, useraddr, tuna.len)) + goto out; + ret = ops->set_tunable(dev, &tuna, data); + +out: + kfree(data); + return ret; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -1670,6 +1744,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: + case ETHTOOL_GTUNABLE: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -1857,6 +1932,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GMODULEEEPROM: rc = ethtool_get_module_eeprom(dev, useraddr); break; + case ETHTOOL_GTUNABLE: + rc = ethtool_get_tunable(dev, useraddr); + break; + case ETHTOOL_STUNABLE: + rc = ethtool_set_tunable(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3 From caa415270c732505240bb60171c44a7838c555e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Sep 2014 22:21:56 -0700 Subject: ipv4: fix a race in update_or_create_fnhe() nh_exceptions is effectively used under rcu, but lacks proper barriers. Between kzalloc() and setting of nh->nh_exceptions(), we need a proper memory barrier. Signed-off-by: Eric Dumazet Fixes: 4895c771c7f00 ("ipv4: Add FIB nexthop exceptions.") Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- net/ipv4/fib_semantics.c | 8 +++++--- net/ipv4/route.c | 6 +++--- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 9922093f575e..f30fd554127e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -87,7 +87,7 @@ struct fib_nh { int nh_saddr_genid; struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; - struct fnhe_hash_bucket *nh_exceptions; + struct fnhe_hash_bucket __rcu *nh_exceptions; }; /* diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b10cd43a4722..5b6efb3d2308 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -157,9 +157,12 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp) static void free_nh_exceptions(struct fib_nh *nh) { - struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fnhe_hash_bucket *hash; int i; + hash = rcu_dereference_protected(nh->nh_exceptions, 1); + if (!hash) + return; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; @@ -205,8 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); - if (nexthop_nh->nh_exceptions) - free_nh_exceptions(nexthop_nh); + free_nh_exceptions(nexthop_nh); rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index eaa4b000c7b4..44b0cbdd76f1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -628,12 +628,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, spin_lock_bh(&fnhe_lock); - hash = nh->nh_exceptions; + hash = rcu_dereference(nh->nh_exceptions); if (!hash) { hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; - nh->nh_exceptions = hash; + rcu_assign_pointer(nh->nh_exceptions, hash); } hash += hval; @@ -1242,7 +1242,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) { - struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions); struct fib_nh_exception *fnhe; u32 hval; -- cgit v1.2.3 From d546c621542df9e45eedc91f35356e887ac63b7b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Sep 2014 08:21:31 -0700 Subject: ipv4: harden fnhe_hashfun() Lets make this hash function a bit secure, as ICMP attacks are still in the wild. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 ++- net/ipv4/route.c | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f30fd554127e..dc9d2a27c315 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -65,7 +65,8 @@ struct fnhe_hash_bucket { struct fib_nh_exception __rcu *chain; }; -#define FNHE_HASH_SIZE 2048 +#define FNHE_HASH_SHIFT 11 +#define FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT) #define FNHE_RECLAIM_DEPTH 5 struct fib_nh { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 44b0cbdd76f1..234a43e233dc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -596,12 +596,12 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) static inline u32 fnhe_hashfun(__be32 daddr) { + static u32 fnhe_hashrnd __read_mostly; u32 hval; - hval = (__force u32) daddr; - hval ^= (hval >> 11) ^ (hval >> 22); - - return hval & (FNHE_HASH_SIZE - 1); + net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd)); + hval = jhash_1word((__force u32) daddr, fnhe_hashrnd); + return hash_32(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) -- cgit v1.2.3 From 62bccb8cdb69051b95a55ab0c489e3cab261c8ef Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 4 Sep 2014 13:31:35 -0400 Subject: net-timestamp: Make the clone operation stand-alone from phy timestamping The phy timestamping takes a different path than the regular timestamping does in that it will create a clone first so that the packets needing to be timestamped can be placed in a queue, or the context block could be used. In order to support these use cases I am pulling the core of the code out so it can be used in other drivers beyond just phy devices. In addition I have added a destructor named sock_efree which is meant to provide a simple way for dropping the reference to skb exceptions that aren't part of either the receive or send windows for the socket, and I have removed some duplication in spots where this destructor could be used in place of sock_edemux. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- drivers/net/phy/dp83640.c | 6 +++--- include/linux/skbuff.h | 2 ++ include/net/sock.h | 1 + net/core/skbuff.c | 32 +++++++++++++++++++++++++------- net/core/sock.c | 6 ++++++ net/core/timestamping.c | 14 +++----------- 6 files changed, 40 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index d5991ac46ab0..87648b306551 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -1148,7 +1148,7 @@ static void dp83640_remove(struct phy_device *phydev) kfree_skb(skb); while ((skb = skb_dequeue(&dp83640->tx_queue)) != NULL) - skb_complete_tx_timestamp(skb, NULL); + kfree_skb(skb); clock = dp83640_clock_get(dp83640->clock); @@ -1405,7 +1405,7 @@ static void dp83640_txtstamp(struct phy_device *phydev, case HWTSTAMP_TX_ONESTEP_SYNC: if (is_sync(skb, type)) { - skb_complete_tx_timestamp(skb, NULL); + kfree_skb(skb); return; } /* fall through */ @@ -1416,7 +1416,7 @@ static void dp83640_txtstamp(struct phy_device *phydev, case HWTSTAMP_TX_OFF: default: - skb_complete_tx_timestamp(skb, NULL); + kfree_skb(skb); break; } } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 02529fcad1ac..1cf0cfaef10a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2690,6 +2690,8 @@ static inline ktime_t net_invalid_timestamp(void) return ktime_set(0, 0); } +struct sk_buff *skb_clone_sk(struct sk_buff *skb); + #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING void skb_clone_tx_timestamp(struct sk_buff *skb); diff --git a/include/net/sock.h b/include/net/sock.h index 3fde6130789d..e02be37a3d91 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1574,6 +1574,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, void sock_wfree(struct sk_buff *skb); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); +void sock_efree(struct sk_buff *skb); void sock_edemux(struct sk_buff *skb); int sock_setsockopt(struct socket *sock, int level, int op, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 697e696c914b..a936a401564e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3511,6 +3511,27 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) } EXPORT_SYMBOL(sock_dequeue_err_skb); +struct sk_buff *skb_clone_sk(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sk_buff *clone; + + if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) + return NULL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return NULL; + } + + clone->sk = sk; + clone->destructor = sock_efree; + + return clone; +} +EXPORT_SYMBOL(skb_clone_sk); + static void __skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk, int tstype) @@ -3540,14 +3561,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, { struct sock *sk = skb->sk; - skb->sk = NULL; + /* take a reference to prevent skb_orphan() from freeing the socket */ + sock_hold(sk); - if (hwtstamps) { - *skb_hwtstamps(skb) = *hwtstamps; - __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); - } else { - kfree_skb(skb); - } + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); sock_put(sk); } diff --git a/net/core/sock.c b/net/core/sock.c index f1a638ee93d9..d04005c51724 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1637,6 +1637,12 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +void sock_efree(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_efree); + void sock_edemux(struct sk_buff *skb) { struct sock *sk = skb->sk; diff --git a/net/core/timestamping.c b/net/core/timestamping.c index f48a59fd8f39..43d3dd62fcc8 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -36,10 +36,9 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) { struct phy_device *phydev; struct sk_buff *clone; - struct sock *sk = skb->sk; unsigned int type; - if (!sk) + if (!skb->sk) return; type = classify(skb); @@ -48,16 +47,9 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) phydev = skb->dev->phydev; if (likely(phydev->drv->txtstamp)) { - if (!atomic_inc_not_zero(&sk->sk_refcnt)) + clone = skb_clone_sk(skb); + if (!clone) return; - - clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) { - sock_put(sk); - return; - } - - clone->sk = sk; phydev->drv->txtstamp(phydev, clone, type); } } -- cgit v1.2.3 From 82eabd9eb2ec1603282a2c3f74dfcb6fe0aaea0e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 4 Sep 2014 13:32:11 -0400 Subject: net: merge cases where sock_efree and sock_edemux are the same function Since sock_efree and sock_demux are essentially the same code for non-TCP sockets and the case where CONFIG_INET is not defined we can combine the code or replace the call to sock_edemux in several spots. As a result we can avoid a bit of unnecessary code or code duplication. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++++ net/core/sock.c | 4 ++-- net/ipv4/udp.c | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index e02be37a3d91..ad23e80cb8d3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1575,7 +1575,11 @@ void sock_wfree(struct sk_buff *skb); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); +#ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); +#else +#define sock_edemux(skb) sock_efree(skb) +#endif int sock_setsockopt(struct socket *sock, int level, int op, char __user *optval, unsigned int optlen); diff --git a/net/core/sock.c b/net/core/sock.c index d04005c51724..69592cb66e3b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1643,18 +1643,18 @@ void sock_efree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_efree); +#ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb) { struct sock *sk = skb->sk; -#ifdef CONFIG_INET if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_put(inet_twsk(sk)); else -#endif sock_put(sk); } EXPORT_SYMBOL(sock_edemux); +#endif kuid_t sock_i_uid(struct sock *sk) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0da3849fd35b..cd0db5471bb5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1972,7 +1972,7 @@ void udp_v4_early_demux(struct sk_buff *skb) return; skb->sk = sk; - skb->destructor = sock_edemux; + skb->destructor = sock_efree; dst = sk->sk_rx_dst; if (dst) -- cgit v1.2.3 From 56193d1bce2b2759cb4bdcc00cd05544894a0c90 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 5 Sep 2014 19:20:26 -0400 Subject: net: Add function for parsing the header length out of linear ethernet frames This patch updates some of the flow_dissector api so that it can be used to parse the length of ethernet buffers stored in fragments. Most of the changes needed were to __skb_get_poff as it needed to be updated to support sending a linear buffer instead of a skb. I have split __skb_get_poff into two functions, the first is skb_get_poff and it retains the functionality of the original __skb_get_poff. The other function is __skb_get_poff which now works much like __skb_flow_dissect in relation to skb_flow_dissect in that it provides the same functionality but works with just a data buffer and hlen instead of needing an skb. Signed-off-by: Alexander Duyck Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 1 + include/linux/skbuff.h | 4 +++- include/net/flow_keys.h | 2 ++ net/core/filter.c | 2 +- net/core/flow_dissector.c | 46 +++++++++++++++++++++++++++++++-------------- net/ethernet/eth.c | 27 ++++++++++++++++++++++++++ 6 files changed, 66 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 9c5529dc6d07..733980fce8e3 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -29,6 +29,7 @@ #include #ifdef __KERNEL__ +u32 eth_get_headlen(void *data, unsigned int max_len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1cf0cfaef10a..07c9fdd0c126 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3218,7 +3218,9 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); -u32 __skb_get_poff(const struct sk_buff *skb); +u32 skb_get_poff(const struct sk_buff *skb); +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int hlen); /** * skb_head_is_locked - Determine if the skb->head is locked down diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index 9a03f73c4974..7ee2df083542 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -40,4 +40,6 @@ static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); } u32 flow_hash_from_keys(struct flow_keys *keys); +unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len, + __be16 protocol); #endif diff --git a/net/core/filter.c b/net/core/filter.c index 37f8eb06fdee..fa5b7d0f77ac 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -113,7 +113,7 @@ static unsigned int pkt_type_offset(void) static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { - return __skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff((struct sk_buff *)(unsigned long) ctx); } static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 12f48ca7a0b0..8560dea58803 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -13,6 +13,7 @@ #include #include #include +#include /* copy saddr & daddr, possibly using 64bit load/store * Equivalent to : flow->src = iph->saddr; @@ -117,6 +118,13 @@ ipv6: flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); + /* skip the flow label processing if skb is NULL. The + * assumption here is that if there is no skb we are not + * looking for flow info as much as we are length. + */ + if (!skb) + break; + flow_label = ip6_flowlabel(iph); if (flow_label) { /* Awesome, IPv6 packet has a flow label so we can @@ -165,6 +173,9 @@ ipv6: return false; } } + case htons(ETH_P_FCOE): + flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + /* fall through */ default: return false; } @@ -316,26 +327,18 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(__skb_tx_hash); -/* __skb_get_poff() returns the offset to the payload as far as it could - * be dissected. The main user is currently BPF, so that we can dynamically - * truncate packets without needing to push actual payload to the user - * space and can analyze headers only, instead. - */ -u32 __skb_get_poff(const struct sk_buff *skb) +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int hlen) { - struct flow_keys keys; - u32 poff = 0; + u32 poff = keys->thoff; - if (!skb_flow_dissect(skb, &keys)) - return 0; - - poff += keys.thoff; - switch (keys.ip_proto) { + switch (keys->ip_proto) { case IPPROTO_TCP: { const struct tcphdr *tcph; struct tcphdr _tcph; - tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); + tcph = __skb_header_pointer(skb, poff, sizeof(_tcph), + data, hlen, &_tcph); if (!tcph) return poff; @@ -369,6 +372,21 @@ u32 __skb_get_poff(const struct sk_buff *skb) return poff; } +/* skb_get_poff() returns the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 skb_get_poff(const struct sk_buff *skb) +{ + struct flow_keys keys; + + if (!skb_flow_dissect(skb, &keys)) + return 0; + + return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); +} + static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 5cebca134585..33a140e15834 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -145,6 +145,33 @@ int eth_rebuild_header(struct sk_buff *skb) } EXPORT_SYMBOL(eth_rebuild_header); +/** + * eth_get_headlen - determine the the length of header for an ethernet frame + * @data: pointer to start of frame + * @len: total length of frame + * + * Make a best effort attempt to pull the length for all of the headers for + * a given frame in a linear buffer. + */ +u32 eth_get_headlen(void *data, unsigned int len) +{ + const struct ethhdr *eth = (const struct ethhdr *)data; + struct flow_keys keys; + + /* this should never happen, but better safe than sorry */ + if (len < sizeof(*eth)) + return len; + + /* parse any remaining L2/L3 headers, check for L4 */ + if (!__skb_flow_dissect(NULL, &keys, data, + eth->h_proto, sizeof(*eth), len)) + return max_t(u32, keys.thoff, sizeof(*eth)); + + /* parse for any L4 headers */ + return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); +} +EXPORT_SYMBOL(eth_get_headlen); + /** * eth_type_trans - determine the packet's protocol ID. * @skb: received socket data -- cgit v1.2.3 From 04317dafd11dd7b0ec19b85f098414abae6ed5f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2014 15:33:32 -0700 Subject: tcp: introduce TCP_SKB_CB(skb)->tcp_tw_isn TCP_SKB_CB(skb)->when has different meaning in output and input paths. In output path, it contains a timestamp. In input path, it contains an ISN, chosen by tcp_timewait_state_process() Lets add a different name to ease code comprehension. Note that 'when' field will disappear in following patch, as skb_mstamp already contains timestamp, the anonymous union will promptly disappear as well. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 7 ++++++- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 5 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 590e01a476ac..0cd7d2c65dc0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -698,7 +698,12 @@ struct tcp_skb_cb { } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - __u32 when; /* used to compute rtt's */ + union { + /* used in output path */ + __u32 when; /* used to compute rtt's */ + /* used in input path */ + __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ + }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index aba4926ca095..9c8b9f1dcf69 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5906,7 +5906,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct request_sock *req; struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = NULL; - __u32 isn = TCP_SKB_CB(skb)->when; + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; bool want_cookie = false, fastopen; struct flowi fl; struct tcp_fastopen_cookie foc = { .len = -1 }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 487e2a41667f..02e6cd29ebf1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1627,7 +1627,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1649988bd1b6..a058f411d3a6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -232,7 +232,7 @@ kill: u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; - TCP_SKB_CB(skb)->when = isn; + TCP_SKB_CB(skb)->tcp_tw_isn = isn; return TCP_TW_SYN; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 29964c3d363c..5b3c70ff7a72 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -738,7 +738,7 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = inet6_iif(skb); - if (!TCP_SKB_CB(skb)->when && + if (!TCP_SKB_CB(skb)->tcp_tw_isn && (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { @@ -1412,7 +1412,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; -- cgit v1.2.3 From 7faee5c0d514162853a343d93e4a0b6bb8bfec21 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2014 15:33:33 -0700 Subject: tcp: remove TCP_SKB_CB(skb)->when After commit 740b0f1841f6 ("tcp: switch rtt estimations to usec resolution"), we no longer need to maintain timestamps in two different fields. TCP_SKB_CB(skb)->when can be removed, as same information sits in skb_mstamp.stamp_jiffies Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 13 +++++++------ net/ipv4/tcp_input.c | 3 ++- net/ipv4/tcp_ipv4.c | 5 +++-- net/ipv4/tcp_output.c | 39 ++++++++++++++++----------------------- net/ipv4/tcp_timer.c | 7 +++---- 5 files changed, 31 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0cd7d2c65dc0..a4201ef216e8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -672,6 +672,12 @@ void tcp_send_window_probe(struct sock *sk); */ #define tcp_time_stamp ((__u32)(jiffies)) +static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) +{ + return skb->skb_mstamp.stamp_jiffies; +} + + #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) #define TCPHDR_FIN 0x01 @@ -698,12 +704,7 @@ struct tcp_skb_cb { } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - union { - /* used in output path */ - __u32 when; /* used to compute rtt's */ - /* used in input path */ - __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ - }; + __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c8b9f1dcf69..f97003ad0af5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2967,7 +2967,8 @@ void tcp_rearm_rto(struct sock *sk) if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); - const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; + const u32 rto_time_stamp = + tcp_skb_timestamp(skb) + rto; s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); /* delta may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 02e6cd29ebf1..3f9bc3f0bba0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -437,8 +437,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) skb = tcp_write_queue_head(sk); BUG_ON(!skb); - remaining = icsk->icsk_rto - min(icsk->icsk_rto, - tcp_time_stamp - TCP_SKB_CB(skb)->when); + remaining = icsk->icsk_rto - + min(icsk->icsk_rto, + tcp_time_stamp - tcp_skb_timestamp(skb)); if (remaining) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5a7c41fbc6d3..3b22dcb7bb5c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -550,7 +550,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; + opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -618,7 +618,7 @@ static unsigned int tcp_synack_options(struct sock *sk, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsval = tcp_skb_timestamp(skb); opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -647,7 +647,6 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { - struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); unsigned int size = 0; unsigned int eff_sacks; @@ -666,7 +665,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; + opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -886,8 +885,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; - /* Our usage of tstamp should remain private */ - skb->tstamp.tv64 = 0; } inet = inet_sk(sk); @@ -975,7 +972,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + /* Our usage of tstamp should remain private */ + skb->tstamp.tv64 = 0; err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + if (likely(err <= 0)) return err; @@ -1149,7 +1149,6 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Looks stupid, but our code really uses when of * skbs, which it never sent before. --ANK */ - TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; buff->tstamp = skb->tstamp; tcp_fragment_tstamp(skb, buff); @@ -1874,8 +1873,8 @@ static int tcp_mtu_probe(struct sock *sk) tcp_init_tso_segs(sk, nskb, nskb->len); /* We're ready to send. If this fails, the probe will - * be resegmented into mss-sized pieces by tcp_write_xmit(). */ - TCP_SKB_CB(nskb)->when = tcp_time_stamp; + * be resegmented into mss-sized pieces by tcp_write_xmit(). + */ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. */ @@ -1935,8 +1934,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, BUG_ON(!tso_segs); if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { - /* "when" is used as a start point for the retransmit timer */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; + /* "skb_mstamp" is used as a start point for the retransmit timer */ + skb_mstamp_get(&skb->skb_mstamp); goto repair; /* Skip network transmission */ } @@ -2000,8 +1999,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; - TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2499,7 +2496,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom @@ -2544,7 +2540,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Save stamp of the first retransmit. */ if (!tp->retrans_stamp) - tp->retrans_stamp = TCP_SKB_CB(skb)->when; + tp->retrans_stamp = tcp_skb_timestamp(skb); /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). @@ -2752,7 +2748,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); /* Send it off. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -2791,7 +2786,6 @@ int tcp_send_synack(struct sock *sk) TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; TCP_ECN_send_synack(tcp_sk(sk), skb); } - TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -2835,10 +2829,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); + skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req); else #endif - TCP_SKB_CB(skb)->when = tcp_time_stamp; + skb_mstamp_get(&skb->skb_mstamp); tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, foc) + sizeof(*th); @@ -3086,7 +3080,7 @@ int tcp_connect(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); - tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->retrans_stamp = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); TCP_ECN_send_syn(sk, buff); @@ -3194,7 +3188,7 @@ void tcp_send_ack(struct sock *sk) tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* Send it off, this clears delayed acks for us. */ - TCP_SKB_CB(buff)->when = tcp_time_stamp; + skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } @@ -3226,7 +3220,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) * send it. */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); - TCP_SKB_CB(skb)->when = tcp_time_stamp; + skb_mstamp_get(&skb->skb_mstamp); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -3270,7 +3264,6 @@ int tcp_write_wakeup(struct sock *sk) tcp_set_skb_tso_segs(sk, skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) tcp_event_new_data_sent(sk, skb); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index df90cd1ce37f..a339e7ba05a4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -135,10 +135,9 @@ static bool retransmits_timed_out(struct sock *sk, if (!inet_csk(sk)->icsk_retransmits) return false; - if (unlikely(!tcp_sk(sk)->retrans_stamp)) - start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; - else - start_ts = tcp_sk(sk)->retrans_stamp; + start_ts = tcp_sk(sk)->retrans_stamp; + if (unlikely(!start_ts)) + start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); -- cgit v1.2.3 From c8d6591752e96c550cb98b781326d72d8eedcc79 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Wed, 3 Sep 2014 06:48:37 -0700 Subject: mac80211: support DTPC IE (from Cisco Client eXtensions) Linux already supports 802.11h, where the access point can tell the client to reduce its transmission power. However, 802.11h is only defined for 5 GHz, where the need for this is much smaller than on 2.4 GHz. Cisco has their own solution, called DTPC (Dynamic Transmit Power Control). Cisco APs on a controller sometimes but not always send 802.11h; they always send DTPC, even on 2.4 GHz. This patch adds support for parsing and honoring the DTPC IE in addition to the 802.11h element (they do not always contain the same limits, so both must be honored); the format is not documented, but very simple. Tested (on top of wireless.git and on 3.16.1) against a Cisco Aironet 1142 joined to a Cisco 2504 WLC, by setting various transmit power levels for the given access points and observing the results. The Wireshark 802.11 dissector agrees with the interpretation of the element, except for negative numbers, which seem to never happen anyway. Signed-off-by: Steinar H. Gunderson Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 ++- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 60 +++++++++++++++++++++++++++++++++++++--------- net/mac80211/util.c | 25 +++++++++++++++++++ 4 files changed, 77 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 77d4f26e0235..61a09f0644aa 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1824,7 +1824,8 @@ enum ieee80211_eid { WLAN_EID_DMG_TSPEC = 146, WLAN_EID_DMG_AT = 147, WLAN_EID_DMG_CAP = 148, - /* 149-150 reserved for Cisco */ + /* 149 reserved for Cisco */ + WLAN_EID_CISCO_VENDOR_SPECIFIC = 150, WLAN_EID_DMG_OPERATION = 151, WLAN_EID_DMG_BSS_PARAM_CHANGE = 152, WLAN_EID_DMG_BEAM_REFINEMENT = 153, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6b7bdd8fae29..c2aaec4dfcf0 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1369,6 +1369,7 @@ struct ieee802_11_elems { const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; const u8 *country_elem; const u8 *pwr_constr_elem; + const u8 *cisco_dtpc_elem; const struct ieee80211_timeout_interval_ie *timeout_int; const u8 *opmode_notif; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0b812a88f95f..a7b92f5f7161 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1230,14 +1230,30 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, return have_chan_pwr; } +static void ieee80211_find_cisco_dtpc(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *channel, + const u8 *cisco_dtpc_ie, + int *pwr_level) +{ + /* From practical testing, the first data byte of the DTPC element + * seems to contain the requested dBm level, and the CLI on Cisco + * APs clearly state the range is -127 to 127 dBm, which indicates + * a signed byte, although it seemingly never actually goes negative. + * The other byte seems to always be zero. + */ + *pwr_level = (__s8)cisco_dtpc_ie[4]; +} + static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, struct ieee80211_mgmt *mgmt, const u8 *country_ie, u8 country_ie_len, - const u8 *pwr_constr_ie) + const u8 *pwr_constr_ie, + const u8 *cisco_dtpc_ie) { - bool has_80211h_pwr = false; - int chan_pwr, pwr_reduction_80211h; + bool has_80211h_pwr = false, has_cisco_pwr = false; + int chan_pwr = 0, pwr_reduction_80211h = 0; + int pwr_level_cisco, pwr_level_80211h; int new_ap_level; if (country_ie && pwr_constr_ie && @@ -1246,16 +1262,35 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, has_80211h_pwr = ieee80211_find_80211h_pwr_constr( sdata, channel, country_ie, country_ie_len, pwr_constr_ie, &chan_pwr, &pwr_reduction_80211h); - new_ap_level = max_t(int, 0, chan_pwr - pwr_reduction_80211h); + pwr_level_80211h = + max_t(int, 0, chan_pwr - pwr_reduction_80211h); + } + + if (cisco_dtpc_ie) { + ieee80211_find_cisco_dtpc( + sdata, channel, cisco_dtpc_ie, &pwr_level_cisco); + has_cisco_pwr = true; } - if (!has_80211h_pwr) + if (!has_80211h_pwr && !has_cisco_pwr) return 0; - sdata_info(sdata, - "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", - new_ap_level, chan_pwr, pwr_reduction_80211h, - sdata->u.mgd.bssid); + /* If we have both 802.11h and Cisco DTPC, apply both limits + * by picking the smallest of the two power levels advertised. + */ + if (has_80211h_pwr && + (!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) { + sdata_info(sdata, + "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", + pwr_level_80211h, chan_pwr, pwr_reduction_80211h, + sdata->u.mgd.bssid); + new_ap_level = pwr_level_80211h; + } else { /* has_cisco_pwr is always true here. */ + sdata_info(sdata, + "Limiting TX power to %d dBm as advertised by %pM\n", + pwr_level_cisco, sdata->u.mgd.bssid); + new_ap_level = pwr_level_cisco; + } if (sdata->ap_power_level == new_ap_level) return 0; @@ -2923,7 +2958,9 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, /* * This is the canonical list of information elements we care about, * the filter code also gives us all changes to the Microsoft OUI - * (00:50:F2) vendor IE which is used for WMM which we need to track. + * (00:50:F2) vendor IE which is used for WMM which we need to track, + * as well as the DTPC IE (part of the Cisco OUI) used for signaling + * changes to requested client power. * * We implement beacon filtering in software since that means we can * avoid processing the frame here and in cfg80211, and userspace @@ -3232,7 +3269,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt, elems.country_elem, elems.country_elem_len, - elems.pwr_constr_elem); + elems.pwr_constr_elem, + elems.cisco_dtpc_elem); ieee80211_bss_info_change_notify(sdata, changed); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b444f27a788e..3c61060a4d2b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1015,6 +1015,31 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, } elems->pwr_constr_elem = pos; break; + case WLAN_EID_CISCO_VENDOR_SPECIFIC: + /* Lots of different options exist, but we only care + * about the Dynamic Transmit Power Control element. + * First check for the Cisco OUI, then for the DTPC + * tag (0x00). + */ + if (elen < 4) { + elem_parse_failed = true; + break; + } + + if (pos[0] != 0x00 || pos[1] != 0x40 || + pos[2] != 0x96 || pos[3] != 0x00) + break; + + if (elen != 6) { + elem_parse_failed = true; + break; + } + + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + + elems->cisco_dtpc_elem = pos; + break; case WLAN_EID_TIMEOUT_INTERVAL: if (elen >= sizeof(struct ieee80211_timeout_interval_ie)) elems->timeout_int = (void *)pos; -- cgit v1.2.3 From c16900cf285ca240f0f84117bf8b88a03c55469b Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 15 Aug 2014 21:17:06 +0300 Subject: Bluetooth: Fix hci_conn reference counting for fixed channels Now that SMP has been converted to use fixed channels we've got a bit of a problem with the hci_conn reference counting. So far the L2CAP code has kept a reference for each L2CAP channel that was notified of the connection. With SMP however this would mean that the connection is never dropped even though there are no other users of it. Furthermore, SMP already does its own hci_conn reference counting internally, starting from a security or pairing request and ending with the key distribution. This patch makes L2CAP fixed channels default to the L2CAP core not keeping a hci_conn reference for them. A new FLAG_HOLD_HCI_CONN flag is added so that L2CAP users can declare an exception to this rule and hold a reference even for their fixed channels. One such exception is the L2CAP socket layer which does want a reference for each socket (e.g. an ATT socket which uses a fixed channel). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 + net/bluetooth/l2cap_core.c | 12 ++++++++++-- net/bluetooth/l2cap_sock.c | 8 ++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index cedda399f9c0..1558eccb19db 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -711,6 +711,7 @@ enum { FLAG_DEFER_SETUP, FLAG_LE_CONN_REQ_SENT, FLAG_PENDING_SECURITY, + FLAG_HOLD_HCI_CONN, }; enum { diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 777c41dbfdbe..a6559225bb50 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -546,7 +546,10 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) l2cap_chan_hold(chan); - hci_conn_hold(conn->hcon); + /* Only keep a reference for fixed channels if they requested it */ + if (chan->chan_type != L2CAP_CHAN_FIXED || + test_bit(FLAG_HOLD_HCI_CONN, &chan->flags)) + hci_conn_hold(conn->hcon); list_add(&chan->list, &conn->chan_l); } @@ -577,7 +580,12 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) chan->conn = NULL; - if (chan->scid != L2CAP_CID_A2MP) + /* Reference was only held for non-fixed channels or + * fixed channels that explicitly requested it using the + * FLAG_HOLD_HCI_CONN flag. + */ + if (chan->chan_type != L2CAP_CHAN_FIXED || + test_bit(FLAG_HOLD_HCI_CONN, &chan->flags)) hci_conn_drop(conn->hcon); if (mgr && mgr->bredr_chan == chan) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index ed06f88e6f10..31f106e61ca2 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -146,6 +146,14 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) case L2CAP_CHAN_RAW: chan->sec_level = BT_SECURITY_SDP; break; + case L2CAP_CHAN_FIXED: + /* Fixed channels default to the L2CAP core not holding a + * hci_conn reference for them. For fixed channels mapping to + * L2CAP sockets we do want to hold a reference so set the + * appropriate flag to request it. + */ + set_bit(FLAG_HOLD_HCI_CONN, &chan->flags); + break; } bacpy(&chan->src, &la.l2_bdaddr); -- cgit v1.2.3 From 51bb8457ddfa74ede52bf8c02054dea831d59fff Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 15 Aug 2014 21:06:57 +0300 Subject: Bluetooth: Improve *_get() functions to return the object type It's natural to have *_get() functions that increment the reference count of an object to return the object type itself. This way it's simple to make a copy of the object pointer and increase the reference count in a single step. This patch updates two such get() functions, namely hci_conn_get() and l2cap_conn_get(), and updates the users to take advantage of the new API. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 ++- include/net/bluetooth/l2cap.h | 2 +- net/bluetooth/hidp/core.c | 10 ++++------ net/bluetooth/l2cap_core.c | 6 +++--- 4 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b0ded1333865..aa75eee8ad7f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -756,9 +756,10 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status); * _get()/_drop() in it, but require the caller to have a valid ref (FIXME). */ -static inline void hci_conn_get(struct hci_conn *conn) +static inline struct hci_conn *hci_conn_get(struct hci_conn *conn) { get_device(&conn->dev); + return conn; } static inline void hci_conn_put(struct hci_conn *conn) diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 1558eccb19db..8f1652ed3326 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -948,7 +948,7 @@ void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan, void __l2cap_physical_cfm(struct l2cap_chan *chan, int result); void l2cap_conn_shutdown(struct l2cap_conn *conn, int err); -void l2cap_conn_get(struct l2cap_conn *conn); +struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 6c7ecf116e74..1b7d605706aa 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -915,7 +915,7 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, /* connection management */ bacpy(&session->bdaddr, bdaddr); - session->conn = conn; + session->conn = l2cap_conn_get(conn); session->user.probe = hidp_session_probe; session->user.remove = hidp_session_remove; session->ctrl_sock = ctrl_sock; @@ -941,13 +941,13 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, if (ret) goto err_free; - l2cap_conn_get(session->conn); get_file(session->intr_sock->file); get_file(session->ctrl_sock->file); *out = session; return 0; err_free: + l2cap_conn_put(session->conn); kfree(session); return ret; } @@ -1327,10 +1327,8 @@ int hidp_connection_add(struct hidp_connadd_req *req, conn = NULL; l2cap_chan_lock(chan); - if (chan->conn) { - l2cap_conn_get(chan->conn); - conn = chan->conn; - } + if (chan->conn) + conn = l2cap_conn_get(chan->conn); l2cap_chan_unlock(chan); if (!conn) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index a6559225bb50..cb36169ef300 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1695,9 +1695,10 @@ static void l2cap_conn_free(struct kref *ref) kfree(conn); } -void l2cap_conn_get(struct l2cap_conn *conn) +struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn) { kref_get(&conn->ref); + return conn; } EXPORT_SYMBOL(l2cap_conn_get); @@ -6908,8 +6909,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) kref_init(&conn->ref); hcon->l2cap_data = conn; - conn->hcon = hcon; - hci_conn_get(conn->hcon); + conn->hcon = hci_conn_get(hcon); conn->hchan = hchan; BT_DBG("hcon %p conn %p hchan %p", hcon, conn, hchan); -- cgit v1.2.3 From eb78d7e53d144995b9e023b151de19fa40af72f3 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 00:41:41 +0300 Subject: Bluetooth: Use zero timeout for immediate scheduling There's no point in passing a "small" timeout to queue_delayed_work() to try to get the callback faster scheduled. Passing 0 is perfectly valid and will cause a shortcut to a direct queue_work(). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index aa75eee8ad7f..18c24f6fce6c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -791,7 +791,7 @@ static inline void hci_conn_drop(struct hci_conn *conn) if (!conn->out) timeo *= 2; } else { - timeo = msecs_to_jiffies(10); + timeo = 0; } break; @@ -800,7 +800,7 @@ static inline void hci_conn_drop(struct hci_conn *conn) break; default: - timeo = msecs_to_jiffies(10); + timeo = 0; break; } -- cgit v1.2.3 From f94b665dcf15324f5ac8aa639e47be0829b6409d Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 00:41:44 +0300 Subject: Bluetooth: Ignore incoming data after initiating disconnection When hci_chan_del is called the disconnection routines get scheduled through a workqueue. If there's any incoming ACL data before the routines get executed there's a chance that a new hci_chan is created and the disconnection never happens. This patch adds a new hci_conn flag to indicate that we're in the process of driving the connection down. We set the flag in hci_chan_del and check for it in hci_chan_create so that no new channels are created for the same connection. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_conn.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 18c24f6fce6c..dbe73642c54c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -553,6 +553,7 @@ enum { HCI_CONN_FIPS, HCI_CONN_STK_ENCRYPT, HCI_CONN_AUTH_INITIATOR, + HCI_CONN_DROP, }; static inline bool hci_conn_ssp_enabled(struct hci_conn *conn) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index cb04a4e3c829..aaa7e388d026 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1291,6 +1291,11 @@ struct hci_chan *hci_chan_create(struct hci_conn *conn) BT_DBG("%s hcon %p", hdev->name, conn); + if (test_bit(HCI_CONN_DROP, &conn->flags)) { + BT_DBG("Refusing to create new hci_chan"); + return NULL; + } + chan = kzalloc(sizeof(*chan), GFP_KERNEL); if (!chan) return NULL; @@ -1318,6 +1323,7 @@ void hci_chan_del(struct hci_chan *chan) /* Force the connection to be immediately dropped */ conn->disc_timeout = 0; + set_bit(HCI_CONN_DROP, &conn->flags); hci_conn_drop(conn); hci_conn_put(conn); -- cgit v1.2.3 From b04afa0c280b7e7ced88692251d75a78c8fcb2a7 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 20:33:30 +0300 Subject: Bluetooth: Remove unused l2cap_conn_shutdown API Now that there are no more users of the l2cap_conn_shutdown API (since smp.c switched to using hci_disconnect) we can simply remove it along with all of it's l2cap_conn variables. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 4 ---- net/bluetooth/l2cap_core.c | 25 ------------------------- 2 files changed, 29 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 8f1652ed3326..be25eddea615 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -625,9 +625,6 @@ struct l2cap_conn { struct delayed_work info_timer; - int disconn_err; - struct work_struct disconn_work; - struct sk_buff *rx_skb; __u32 rx_len; __u8 tx_ident; @@ -947,7 +944,6 @@ void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan, u8 status); void __l2cap_physical_cfm(struct l2cap_chan *chan, int result); -void l2cap_conn_shutdown(struct l2cap_conn *conn, int err); struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 2d550afe4322..2d9a2b58d2c8 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1635,9 +1635,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (work_pending(&conn->pending_rx_work)) cancel_work_sync(&conn->pending_rx_work); - if (work_pending(&conn->disconn_work)) - cancel_work_sync(&conn->disconn_work); - l2cap_unregister_all_users(conn); /* Force the connection to be immediately dropped */ @@ -1670,26 +1667,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_conn_put(conn); } -static void disconn_work(struct work_struct *work) -{ - struct l2cap_conn *conn = container_of(work, struct l2cap_conn, - disconn_work); - - BT_DBG("conn %p", conn); - - l2cap_conn_del(conn->hcon, conn->disconn_err); -} - -void l2cap_conn_shutdown(struct l2cap_conn *conn, int err) -{ - struct hci_dev *hdev = conn->hcon->hdev; - - BT_DBG("conn %p err %d", conn, err); - - conn->disconn_err = err; - queue_work(hdev->workqueue, &conn->disconn_work); -} - static void l2cap_conn_free(struct kref *ref) { struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref); @@ -6943,8 +6920,6 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); - INIT_WORK(&conn->disconn_work, disconn_work); - skb_queue_head_init(&conn->pending_rx); INIT_WORK(&conn->pending_rx_work, process_pending_rx); -- cgit v1.2.3 From e3b679d56caa2bc555dee646a6ac5861631e7a28 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 20:33:32 +0300 Subject: Bluetooth: Update hci_disconnect() to return an error value We'll soon use hci_disconnect() from places that are interested to know whether the hci_send_cmd() really succeeded or not. This patch updates hci_disconnect() to pass on any error returned from hci_send_cmd(). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_conn.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index dbe73642c54c..2b6e04d37593 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -703,7 +703,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, return NULL; } -void hci_disconnect(struct hci_conn *conn, __u8 reason); +int hci_disconnect(struct hci_conn *conn, __u8 reason); bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 5157a0990732..dd2df20b0f7d 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -122,7 +122,7 @@ static void hci_reject_sco(struct hci_conn *conn) hci_send_cmd(conn->hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp); } -void hci_disconnect(struct hci_conn *conn, __u8 reason) +int hci_disconnect(struct hci_conn *conn, __u8 reason) { struct hci_cp_disconnect cp; @@ -132,7 +132,7 @@ void hci_disconnect(struct hci_conn *conn, __u8 reason) cp.handle = cpu_to_le16(conn->handle); cp.reason = reason; - hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp); + return hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp); } static void hci_amp_disconn(struct hci_conn *conn) -- cgit v1.2.3 From f3d82d0c8ec025fc113408e3ad5775fed5a060ff Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 5 Sep 2014 22:19:50 +0300 Subject: Bluetooth: Move identity address update behind a workqueue The identity address update of all channels for an l2cap_conn needs to take the lock for each channel, i.e. it's safest to do this by a separate workqueue callback. Previously this was partially solved by moving the entire SMP key distribution behind a workqueue. However, if we want SMP context locking to be correct and safe we should always use the l2cap_chan lock when accessing it, meaning even smp_distribute_keys needs to take that lock which would once again create a dead lock when updating the identity address. The simplest way to solve this is to have l2cap_conn manage the deferred work which is what this patch does. A subsequent patch will remove the now unnecessary SMP key distribution work struct. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 3 ++- net/bluetooth/l2cap_core.c | 10 ++++++++-- net/bluetooth/smp.c | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index be25eddea615..ead99f032f7a 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -633,6 +633,8 @@ struct l2cap_conn { struct sk_buff_head pending_rx; struct work_struct pending_rx_work; + struct work_struct id_addr_update_work; + __u8 disc_reason; struct l2cap_chan *smp; @@ -937,7 +939,6 @@ int l2cap_ertm_init(struct l2cap_chan *chan); void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan); void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan); void l2cap_chan_del(struct l2cap_chan *chan, int err); -void l2cap_conn_update_id_addr(struct hci_conn *hcon); void l2cap_send_conn_req(struct l2cap_chan *chan); void l2cap_move_start(struct l2cap_chan *chan); void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan, diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ab405f0e53cb..b71430caab4a 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -631,9 +631,11 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) } EXPORT_SYMBOL_GPL(l2cap_chan_del); -void l2cap_conn_update_id_addr(struct hci_conn *hcon) +static void l2cap_conn_update_id_addr(struct work_struct *work) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_conn *conn = container_of(work, struct l2cap_conn, + id_addr_update_work); + struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan; mutex_lock(&conn->chan_lock); @@ -1635,6 +1637,9 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (work_pending(&conn->pending_rx_work)) cancel_work_sync(&conn->pending_rx_work); + if (work_pending(&conn->id_addr_update_work)) + cancel_work_sync(&conn->id_addr_update_work); + l2cap_unregister_all_users(conn); /* Force the connection to be immediately dropped */ @@ -6927,6 +6932,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) skb_queue_head_init(&conn->pending_rx); INIT_WORK(&conn->pending_rx_work, process_pending_rx); + INIT_WORK(&conn->id_addr_update_work, l2cap_conn_update_id_addr); conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 9accb4739488..795c603bed30 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -654,7 +654,7 @@ static void smp_notify_keys(struct l2cap_conn *conn) */ bacpy(&hcon->dst, &smp->remote_irk->bdaddr); hcon->dst_type = smp->remote_irk->addr_type; - l2cap_conn_update_id_addr(hcon); + queue_work(hdev->workqueue, &conn->id_addr_update_work); /* When receiving an indentity resolving key for * a remote device that does not use a resolvable -- cgit v1.2.3 From fc75cc8684d21d3649b28c4c37d4ce3f000759e4 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 5 Sep 2014 22:19:52 +0300 Subject: Bluetooth: Fix locking of the SMP context Before the move the l2cap_chan the SMP context (smp_chan) didn't have any kind of proper locking. The best there existed was the HCI_CONN_LE_SMP_PEND flag which was used to enable mutual exclusion for potential multiple creators of the SMP context. Now that SMP has been converted to use the l2cap_chan infrastructure and since the SMP context is directly mapped to a corresponding l2cap_chan we get the SMP context locking essentially for free through the l2cap_chan lock. For all callbacks that l2cap_core.c makes for each channel implementation (smp.c in the case of SMP) the l2cap_chan lock is held through l2cap_chan_lock(chan). Since the calls from l2cap_core.c to smp.c are covered the only missing piece to have the locking implemented properly is to ensure that the lock is held for any other call path that may access the SMP context. This means user responses through mgmt.c, requests to elevate the security of a connection through hci_conn.c, as well as any deferred work through workqueues. This patch adds the necessary locking to all these other code paths that try to access the SMP context. Since mutual exclusion for the l2cap_chan access is now covered from all directions the patch also removes unnecessary HCI_CONN_LE_SMP_PEND flag (once we've acquired the chan lock we can simply check whether chan->smp is set to know if there's an SMP context). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 - net/bluetooth/smp.c | 76 +++++++++++++++++++++++----------------- 2 files changed, 44 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2b6e04d37593..045d9133d180 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -539,7 +539,6 @@ enum { HCI_CONN_RSWITCH_PEND, HCI_CONN_MODE_CHANGE_PEND, HCI_CONN_SCO_SETUP_PEND, - HCI_CONN_LE_SMP_PEND, HCI_CONN_MGMT_CONNECTED, HCI_CONN_SSP_ENABLED, HCI_CONN_SC_ENABLED, diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 0b4403f3dce1..c71589f4b435 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -409,7 +409,6 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason) { struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; - struct smp_chan *smp; if (reason) smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason), @@ -419,12 +418,7 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason) mgmt_auth_failed(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, HCI_ERROR_AUTH_FAILURE); - if (!chan->data) - return; - - smp = chan->data; - - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) + if (chan->data) smp_chan_destroy(conn); } @@ -706,9 +700,6 @@ static void smp_distribute_keys(struct smp_chan *smp) BT_DBG("conn %p", conn); - if (!test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) - return; - rsp = (void *) &smp->prsp[1]; /* The responder sends its keys first */ @@ -801,7 +792,6 @@ static void smp_distribute_keys(struct smp_chan *smp) if ((smp->remote_key_dist & 0x07)) return; - clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags); set_bit(SMP_FLAG_COMPLETE, &smp->flags); smp_notify_keys(conn); @@ -825,16 +815,13 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) struct smp_chan *smp; smp = kzalloc(sizeof(*smp), GFP_ATOMIC); - if (!smp) { - clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags); + if (!smp) return NULL; - } smp->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(smp->tfm_aes)) { BT_ERR("Unable to create ECB crypto context"); kfree(smp); - clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags); return NULL; } @@ -854,16 +841,23 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) struct l2cap_chan *chan; struct smp_chan *smp; u32 value; + int err; BT_DBG(""); - if (!conn || !test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) + if (!conn) return -ENOTCONN; chan = conn->smp; if (!chan) return -ENOTCONN; + l2cap_chan_lock(chan); + if (!chan->data) { + err = -ENOTCONN; + goto unlock; + } + smp = chan->data; switch (mgmt_op) { @@ -879,12 +873,16 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) case MGMT_OP_USER_PASSKEY_NEG_REPLY: case MGMT_OP_USER_CONFIRM_NEG_REPLY: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); - return 0; + err = 0; + goto unlock; default: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto unlock; } + err = 0; + /* If it is our turn to send Pairing Confirm, do so now */ if (test_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) { u8 rsp = smp_confirm(smp); @@ -892,12 +890,15 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) smp_failure(conn, rsp); } - return 0; +unlock: + l2cap_chan_unlock(chan); + return err; } static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing rsp, *req = (void *) skb->data; + struct l2cap_chan *chan = conn->smp; struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp; u8 key_size, auth, sec_level; @@ -911,12 +912,10 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (conn->hcon->role != HCI_ROLE_SLAVE) return SMP_CMD_NOTSUPP; - if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { + if (!chan->data) smp = smp_chan_create(conn); - } else { - struct l2cap_chan *chan = conn->smp; + else smp = chan->data; - } if (!smp) return SMP_UNSPECIFIED; @@ -1122,6 +1121,7 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_security_req *rp = (void *) skb->data; struct smp_cmd_pairing cp; struct hci_conn *hcon = conn->hcon; + struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; u8 sec_level; @@ -1143,7 +1143,8 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; - if (test_and_set_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) + /* If SMP is already in progress ignore this request */ + if (chan->data) return 0; smp = smp_chan_create(conn); @@ -1170,8 +1171,10 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; __u8 authreq; + int ret; BT_DBG("conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level); @@ -1192,12 +1195,19 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; - if (test_and_set_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) - return 0; + l2cap_chan_lock(chan); + + /* If SMP is already in progress ignore this request */ + if (chan->data) { + ret = 0; + goto unlock; + } smp = smp_chan_create(conn); - if (!smp) - return 1; + if (!smp) { + ret = 1; + goto unlock; + } authreq = seclevel_to_authreq(sec_level); @@ -1223,8 +1233,11 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) } set_bit(SMP_FLAG_INITIATOR, &smp->flags); + ret = 0; - return 0; +unlock: + l2cap_chan_unlock(chan); + return ret; } static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) @@ -1315,7 +1328,6 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; - struct hci_dev *hdev = hcon->hdev; bdaddr_t rpa; BT_DBG(""); @@ -1430,7 +1442,7 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) * returns an error). */ if (code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ && - !test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) { + !chan->data) { BT_ERR("Unexpected SMP command 0x%02x. Disconnecting.", code); err = -EOPNOTSUPP; goto done; @@ -1504,7 +1516,7 @@ static void smp_teardown_cb(struct l2cap_chan *chan, int err) BT_DBG("chan %p", chan); - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) + if (chan->data) smp_chan_destroy(conn); conn->smp = NULL; -- cgit v1.2.3 From a7f26b7e1ee73ac9e766c430fea5af658d839954 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 8 Sep 2014 19:08:34 -0400 Subject: inet: remove dead inetpeer sequence code inetpeer sequence numbers are no longer incremented, so no need to check and flush the tree. The function that increments the sequence number was already dead code and removed in in "ipv4: remove unused function" (068a6e18). Remove the code that checks for a change, too. Verifying that v4_seq and v6_seq are never incremented and thus that flush_check compares bp->flush_seq to 0 is trivial. The second part of the change removes flush_check completely even though bp->flush_seq is exactly !0 once, at initialization. This change is correct because the time this branch is true is when bp->root == peer_avl_empty_rcu, in which the branch and inetpeer_invalidate_tree are a NOOP. Signed-off-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inetpeer.h | 1 - net/ipv4/inetpeer.c | 21 --------------------- 2 files changed, 22 deletions(-) (limited to 'include') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 01d590ee5e7e..80479abddf73 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -61,7 +61,6 @@ struct inet_peer { struct inet_peer_base { struct inet_peer __rcu *root; seqlock_t lock; - u32 flush_seq; int total; }; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index bd5f5928167d..241afd743d2c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -72,29 +72,10 @@ void inet_peer_base_init(struct inet_peer_base *bp) { bp->root = peer_avl_empty_rcu; seqlock_init(&bp->lock); - bp->flush_seq = ~0U; bp->total = 0; } EXPORT_SYMBOL_GPL(inet_peer_base_init); -static atomic_t v4_seq = ATOMIC_INIT(0); -static atomic_t v6_seq = ATOMIC_INIT(0); - -static atomic_t *inetpeer_seq_ptr(int family) -{ - return (family == AF_INET ? &v4_seq : &v6_seq); -} - -static inline void flush_check(struct inet_peer_base *base, int family) -{ - atomic_t *fp = inetpeer_seq_ptr(family); - - if (unlikely(base->flush_seq != atomic_read(fp))) { - inetpeer_invalidate_tree(base); - base->flush_seq = atomic_read(fp); - } -} - #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ /* Exported for sysctl_net_ipv4. */ @@ -444,8 +425,6 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, unsigned int sequence; int invalidated, gccnt = 0; - flush_check(base, daddr->family); - /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ -- cgit v1.2.3 From e1e930f591bfd9604c3077f0af5c390f4f890259 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 8 Sep 2014 17:09:49 -0700 Subject: Bluetooth: Fix mgmt pairing failure when authentication fails Whether through HCI with BR/EDR or SMP with LE when authentication fails we should also notify any pending Pair Device mgmt command. This patch updates the mgmt_auth_failed function to take the actual hci_conn object and makes sure that any pending pairing command is notified and cleaned up appropriately. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 +-- net/bluetooth/hci_event.c | 6 ++---- net/bluetooth/mgmt.c | 19 +++++++++++++------ net/bluetooth/smp.c | 3 +-- 4 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 045d9133d180..206b92bfeebb 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1342,8 +1342,7 @@ int mgmt_user_passkey_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u32 passkey, u8 entered); -void mgmt_auth_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, - u8 addr_type, u8 status); +void mgmt_auth_failed(struct hci_conn *conn, u8 status); void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status); void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status); void mgmt_sc_enable_complete(struct hci_dev *hdev, u8 enable, u8 status); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index e6a496ae0318..3a8381ab992f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2320,8 +2320,7 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->sec_level = conn->pending_sec_level; } } else { - mgmt_auth_failed(hdev, &conn->dst, conn->type, conn->dst_type, - ev->status); + mgmt_auth_failed(conn, ev->status); } clear_bit(HCI_CONN_AUTH_PEND, &conn->flags); @@ -3900,8 +3899,7 @@ static void hci_simple_pair_complete_evt(struct hci_dev *hdev, * event gets always produced as initiator and is also mapped to * the mgmt_auth_failed event */ if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) && ev->status) - mgmt_auth_failed(hdev, &conn->dst, conn->type, conn->dst_type, - ev->status); + mgmt_auth_failed(conn, ev->status); hci_conn_drop(conn); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index ab9521ae3c63..efb71b022ab6 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6485,16 +6485,23 @@ int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, return mgmt_event(MGMT_EV_PASSKEY_NOTIFY, hdev, &ev, sizeof(ev), NULL); } -void mgmt_auth_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, - u8 addr_type, u8 status) +void mgmt_auth_failed(struct hci_conn *conn, u8 hci_status) { struct mgmt_ev_auth_failed ev; + struct pending_cmd *cmd; + u8 status = mgmt_status(hci_status); - bacpy(&ev.addr.bdaddr, bdaddr); - ev.addr.type = link_to_bdaddr(link_type, addr_type); - ev.status = mgmt_status(status); + bacpy(&ev.addr.bdaddr, &conn->dst); + ev.addr.type = link_to_bdaddr(conn->type, conn->dst_type); + ev.status = status; - mgmt_event(MGMT_EV_AUTH_FAILED, hdev, &ev, sizeof(ev), NULL); + cmd = find_pairing(conn); + + mgmt_event(MGMT_EV_AUTH_FAILED, conn->hdev, &ev, sizeof(ev), + cmd ? cmd->sk : NULL); + + if (cmd) + pairing_complete(cmd, status); } void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status) diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 560f78a9f960..25c9040e0b12 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -420,8 +420,7 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason) &reason); clear_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags); - mgmt_auth_failed(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, - HCI_ERROR_AUTH_FAILURE); + mgmt_auth_failed(hcon, HCI_ERROR_AUTH_FAILURE); if (chan->data) smp_chan_destroy(conn); -- cgit v1.2.3 From 2a5538e9aa4929329813bee69922c9ae4990fcad Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 25 Aug 2014 12:05:27 +0200 Subject: netfilter: nat: move specific NAT IPv6 to core Move the specific NAT IPv6 core functions that are called from the hooks from ip6table_nat.c to nf_nat_l3proto_ipv6.c. This prepares the ground to allow iptables and nft to use the same NAT engine code that comes in a follow up patch. This also renames nf_nat_ipv6_fn to nft_nat_ipv6_fn in net/ipv6/netfilter/nft_chain_nat_ipv6.c to avoid a compilation breakage. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_l3proto.h | 37 +++++ net/ipv6/netfilter/ip6table_nat.c | 233 +++++-------------------------- net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 199 ++++++++++++++++++++++++++ net/ipv6/netfilter/nft_chain_nat_ipv6.c | 10 +- 4 files changed, 275 insertions(+), 204 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h index bc2d51574489..340c013795a4 100644 --- a/include/net/netfilter/nf_nat_l3proto.h +++ b/include/net/netfilter/nf_nat_l3proto.h @@ -84,4 +84,41 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen); +unsigned int nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + #endif /* _NF_NAT_L3PROTO_H */ diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 387d8b8fc18d..b0634ac996b7 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -30,222 +30,57 @@ static const struct xt_table nf_nat_ipv6_table = { .af = NFPROTO_IPV6, }; -static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) -{ - /* Force range to this IP; let proto decide mapping for - * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). - */ - struct nf_nat_range range; - - range.flags = 0; - pr_debug("Allocating NULL binding for %p (%pI6)\n", ct, - HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip6 : - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip6); - - return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); -} - -static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, - const struct net_device *in, - const struct net_device *out, - struct nf_conn *ct) +static unsigned int ip6table_nat_do_chain(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - unsigned int ret; - ret = ip6t_do_table(skb, hooknum, in, out, net->ipv6.ip6table_nat); - if (ret == NF_ACCEPT) { - if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) - ret = alloc_null_binding(ct, hooknum); - } - return ret; + return ip6t_do_table(skb, ops->hooknum, in, out, net->ipv6.ip6table_nat); } -static unsigned int -nf_nat_ipv6_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - struct nf_conn_nat *nat; - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); - __be16 frag_off; - int hdrlen; - u8 nexthdr; - - ct = nf_ct_get(skb, &ctinfo); - /* Can't track? It's not due to stress, or conntrack would - * have dropped it. Hence it's the user's responsibilty to - * packet filter it out, or implement conntrack/NAT for that - * protocol. 8) --RR - */ - if (!ct) - return NF_ACCEPT; - - /* Don't try to NAT if this packet is not conntracked */ - if (nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - nexthdr = ipv6_hdr(skb)->nexthdr; - hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, - ops->hooknum, - hdrlen)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - unsigned int ret; - - ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); - if (ret != NF_ACCEPT) - return ret; - } else { - pr_debug("Already setup manip %s for ct %p\n", - maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", - ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - break; - - default: - /* ESTABLISHED */ - NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); - -oif_changed: - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_DROP; + return nf_nat_ipv6_fn(ops, skb, in, out, ip6table_nat_do_chain); } -static unsigned int -nf_nat_ipv6_in(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - unsigned int ret; - struct in6_addr daddr = ipv6_hdr(skb)->daddr; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) - skb_dst_drop(skb); - - return ret; + return nf_nat_ipv6_in(ops, skb, in, out, ip6table_nat_do_chain); } -static unsigned int -nf_nat_ipv6_out(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { -#ifdef CONFIG_XFRM - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - int err; -#endif - unsigned int ret; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, - &ct->tuplehash[!dir].tuple.dst.u3) || - (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && - ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET6); - if (err < 0) - ret = NF_DROP_ERR(err); - } - } -#endif - return ret; + return nf_nat_ipv6_out(ops, skb, in, out, ip6table_nat_do_chain); } -static unsigned int -nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned int ret; - int err; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, - &ct->tuplehash[!dir].tuple.src.u3)) { - err = ip6_route_me_harder(skb); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#ifdef CONFIG_XFRM - else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && - ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET6); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#endif - } - return ret; + return nf_nat_ipv6_local_fn(ops, skb, in, out, ip6table_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv6_in, + .hook = ip6table_nat_in, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, @@ -253,7 +88,7 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv6_out, + .hook = ip6table_nat_out, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, @@ -261,7 +96,7 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv6_local_fn, + .hook = ip6table_nat_local_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, @@ -269,7 +104,7 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv6_fn, + .hook = ip6table_nat_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index fc8e49b2ff3e..c5812e1c1ffb 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -261,6 +261,205 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); +unsigned int +nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + __be16 frag_off; + int hdrlen; + u8 nexthdr; + + ct = nf_ct_get(skb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + * have dropped it. Hence it's the user's responsibilty to + * packet filter it out, or implement conntrack/NAT for that + * protocol. 8) --RR + */ + if (!ct) + return NF_ACCEPT; + + /* Don't try to NAT if this packet is not conntracked */ + if (nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + nexthdr = ipv6_hdr(skb)->nexthdr; + hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, + ops->hooknum, + hdrlen)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + unsigned int ret; + + ret = do_chain(ops, skb, in, out, ct); + if (ret != NF_ACCEPT) + return ret; + + if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + break; + + ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + if (ret != NF_ACCEPT) + return ret; + } else { + pr_debug("Already setup manip %s for ct %p\n", + maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", + ct); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + break; + + default: + /* ESTABLISHED */ + NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || + ctinfo == IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + +oif_changed: + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn); + +unsigned int +nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + unsigned int ret; + struct in6_addr daddr = ipv6_hdr(skb)->daddr; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) + skb_dst_drop(skb); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_in); + +unsigned int +nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ +#ifdef CONFIG_XFRM + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int err; +#endif + unsigned int ret; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct ipv6hdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3) || + (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && + ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all)) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } + } +#endif + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_out); + +unsigned int +nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + int err; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct ipv6hdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, + &ct->tuplehash[!dir].tuple.src.u3)) { + err = ip6_route_me_harder(skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#ifdef CONFIG_XFRM + else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && + ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#endif + } + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_local_fn); + static int __init nf_nat_l3proto_ipv6_init(void) { int err; diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index d189fcb437fe..c1c74491a10b 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -28,7 +28,7 @@ * IPv6 NAT chains */ -static unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -97,7 +97,7 @@ static unsigned int nf_nat_ipv6_prerouting(const struct nf_hook_ops *ops, struct in6_addr daddr = ipv6_hdr(skb)->daddr; unsigned int ret; - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + ret = nft_nat_ipv6_fn(ops, skb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); @@ -115,7 +115,7 @@ static unsigned int nf_nat_ipv6_postrouting(const struct nf_hook_ops *ops, const struct nf_conn *ct __maybe_unused; unsigned int ret; - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + ret = nft_nat_ipv6_fn(ops, skb, in, out, okfn); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && @@ -143,7 +143,7 @@ static unsigned int nf_nat_ipv6_output(const struct nf_hook_ops *ops, const struct nf_conn *ct; unsigned int ret; - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + ret = nft_nat_ipv6_fn(ops, skb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -177,7 +177,7 @@ static const struct nf_chain_type nft_chain_nat_ipv6 = { [NF_INET_PRE_ROUTING] = nf_nat_ipv6_prerouting, [NF_INET_POST_ROUTING] = nf_nat_ipv6_postrouting, [NF_INET_LOCAL_OUT] = nf_nat_ipv6_output, - [NF_INET_LOCAL_IN] = nf_nat_ipv6_fn, + [NF_INET_LOCAL_IN] = nft_nat_ipv6_fn, }, }; -- cgit v1.2.3 From 3045d76070abe725dbb7fd8ff39c27b820d5a7eb Mon Sep 17 00:00:00 2001 From: Ana Rey Date: Tue, 2 Sep 2014 20:36:14 +0200 Subject: netfilter: nf_tables: add devgroup support in meta expresion Add devgroup support to let us match device group of a packets incoming or outgoing interface. Signed-off-by: Ana Rey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ net/netfilter/nft_meta.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c9b6f00a3fb7..c000947d3f38 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -573,6 +573,8 @@ enum nft_exthdr_attributes { * @NFT_META_BRI_OIFNAME: packet output bridge interface name * @NFT_META_PKTTYPE: packet type (skb->pkt_type), special handling for loopback * @NFT_META_CPU: cpu id through smp_processor_id() + * @NFT_META_IIFGROUP: packet input interface group + * @NFT_META_OIFGROUP: packet output interface group */ enum nft_meta_keys { NFT_META_LEN, @@ -596,6 +598,8 @@ enum nft_meta_keys { NFT_META_BRI_OIFNAME, NFT_META_PKTTYPE, NFT_META_CPU, + NFT_META_IIFGROUP, + NFT_META_OIFGROUP, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 843e099a962d..1e7c076ca63a 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -155,6 +155,16 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_CPU: dest->data[0] = smp_processor_id(); break; + case NFT_META_IIFGROUP: + if (in == NULL) + goto err; + dest->data[0] = in->group; + break; + case NFT_META_OIFGROUP: + if (out == NULL) + goto err; + dest->data[0] = out->group; + break; default: WARN_ON(1); goto err; @@ -228,6 +238,8 @@ int nft_meta_get_init(const struct nft_ctx *ctx, #endif case NFT_META_PKTTYPE: case NFT_META_CPU: + case NFT_META_IIFGROUP: + case NFT_META_OIFGROUP: break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From e42eff8a32f8b7bde88ea3c5a56391407cbe84f3 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Thu, 4 Sep 2014 14:06:14 +0200 Subject: netfilter: nft_nat: include a flag attribute Both SNAT and DNAT (and the upcoming masquerade) can have additional configuration parameters, such as port randomization and NAT addressing persistence. We can cover these scenarios by simply adding a flag attribute for userspace to fill when needed. The flags to use are defined in include/uapi/linux/netfilter/nf_nat.h: NF_NAT_RANGE_MAP_IPS NF_NAT_RANGE_PROTO_SPECIFIED NF_NAT_RANGE_PROTO_RANDOM NF_NAT_RANGE_PERSISTENT NF_NAT_RANGE_PROTO_RANDOM_FULLY NF_NAT_RANGE_PROTO_RANDOM_ALL The caller must take care of not messing up with the flags, as they are added unconditionally to the final resulting nf_nat_range. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_nat.h | 5 +++++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_nat.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index 1ad3659102b6..0880781ad7b6 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -13,6 +13,11 @@ #define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) +#define NF_NAT_RANGE_MASK \ + (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ + NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ + NF_NAT_RANGE_PROTO_RANDOM_FULLY) + struct nf_nat_ipv4_range { unsigned int flags; __be32 min_ip; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c000947d3f38..6022c6e6be18 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -785,6 +785,7 @@ enum nft_nat_types { * @NFTA_NAT_REG_ADDR_MAX: source register of address range end (NLA_U32: nft_registers) * @NFTA_NAT_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) * @NFTA_NAT_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers) + * @NFTA_NAT_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) */ enum nft_nat_attributes { NFTA_NAT_UNSPEC, @@ -794,6 +795,7 @@ enum nft_nat_attributes { NFTA_NAT_REG_ADDR_MAX, NFTA_NAT_REG_PROTO_MIN, NFTA_NAT_REG_PROTO_MAX, + NFTA_NAT_FLAGS, __NFTA_NAT_MAX }; #define NFTA_NAT_MAX (__NFTA_NAT_MAX - 1) diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 79ff58cd36dc..799550b476fb 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -33,6 +33,7 @@ struct nft_nat { enum nft_registers sreg_proto_max:8; enum nf_nat_manip_type type:8; u8 family; + u16 flags; }; static void nft_nat_eval(const struct nft_expr *expr, @@ -71,6 +72,8 @@ static void nft_nat_eval(const struct nft_expr *expr, range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } + range.flags |= priv->flags; + data[NFT_REG_VERDICT].verdict = nf_nat_setup_info(ct, &range, priv->type); } @@ -82,6 +85,7 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, + [NFTA_NAT_FLAGS] = { .type = NLA_U32 }, }; static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -149,6 +153,12 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else priv->sreg_proto_max = priv->sreg_proto_min; + if (tb[NFTA_NAT_FLAGS]) { + priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + } + return 0; } @@ -183,6 +193,12 @@ static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) htonl(priv->sreg_proto_max))) goto nla_put_failure; } + + if (priv->flags != 0) { + if (nla_put_be32(skb, NFTA_NAT_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + } + return 0; nla_put_failure: -- cgit v1.2.3 From 8dd33cc93ec92b8460ed2ad98c6db39276f6a72b Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Thu, 4 Sep 2014 14:06:33 +0200 Subject: netfilter: nf_nat: generalize IPv4 masquerading support for nf_tables Let's refactor the code so we can reach the masquerade functionality from outside the xt context (ie. nftables). The patch includes the addition of an atomic counter to the masquerade notifier: the stuff to be done by the notifier is the same for xt and nftables. Therefore, only one notification handler is needed. This factorization only involves IPv4; a similar patch follows to handle IPv6. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_nat_masquerade.h | 14 +++ net/ipv4/netfilter/Kconfig | 7 ++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_MASQUERADE.c | 108 ++--------------- net/ipv4/netfilter/nf_nat_masquerade_ipv4.c | 153 +++++++++++++++++++++++++ 5 files changed, 184 insertions(+), 99 deletions(-) create mode 100644 include/net/netfilter/ipv4/nf_nat_masquerade.h create mode 100644 net/ipv4/netfilter/nf_nat_masquerade_ipv4.c (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h new file mode 100644 index 000000000000..a9c001c646da --- /dev/null +++ b/include/net/netfilter/ipv4/nf_nat_masquerade.h @@ -0,0 +1,14 @@ +#ifndef _NF_NAT_MASQUERADE_IPV4_H_ +#define _NF_NAT_MASQUERADE_IPV4_H_ + +#include + +unsigned int +nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, + const struct nf_nat_range *range, + const struct net_device *out); + +void nf_nat_masquerade_ipv4_register_notifier(void); +void nf_nat_masquerade_ipv4_unregister_notifier(void); + +#endif /*_NF_NAT_MASQUERADE_IPV4_H_ */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index fb173126f03d..4be3e541350e 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -184,8 +184,15 @@ config NF_NAT_IPV4 if NF_NAT_IPV4 +config NF_NAT_MASQUERADE_IPV4 + tristate "IPv4 masquerade support" + help + This is the kernel functionality to provide NAT in the masquerade + flavour (automatic source address selection). + config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" + select NF_NAT_MASQUERADE_IPV4 default m if NETFILTER_ADVANCED=n help Masquerading is a special case of NAT: all outgoing connections are diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 33001621465b..42056b2fd0e3 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o +obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 00352ce0f0de..da7f02a0b868 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -22,6 +22,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); @@ -46,103 +47,17 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) static unsigned int masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { - struct nf_conn *ct; - struct nf_conn_nat *nat; - enum ip_conntrack_info ctinfo; - struct nf_nat_range newrange; + struct nf_nat_range range; const struct nf_nat_ipv4_multi_range_compat *mr; - const struct rtable *rt; - __be32 newsrc, nh; - - NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); - - ct = nf_ct_get(skb, &ctinfo); - nat = nfct_nat(ct); - - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); - - /* Source address is 0.0.0.0 - locally generated packet that is - * probably not supposed to be masqueraded. - */ - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) - return NF_ACCEPT; mr = par->targinfo; - rt = skb_rtable(skb); - nh = rt_nexthop(rt, ip_hdr(skb)->daddr); - newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE); - if (!newsrc) { - pr_info("%s ate my IP address\n", par->out->name); - return NF_DROP; - } - - nat->masq_index = par->out->ifindex; - - /* Transfer from original range. */ - memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); - memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); - newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.ip = newsrc; - newrange.max_addr.ip = newsrc; - newrange.min_proto = mr->range[0].min; - newrange.max_proto = mr->range[0].max; + range.flags = mr->range[0].flags; + range.min_proto = mr->range[0].min; + range.max_proto = mr->range[0].max; - /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); + return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out); } -static int -device_cmp(struct nf_conn *i, void *ifindex) -{ - const struct nf_conn_nat *nat = nfct_nat(i); - - if (!nat) - return 0; - if (nf_ct_l3num(i) != NFPROTO_IPV4) - return 0; - return nat->masq_index == (int)(long)ifindex; -} - -static int masq_device_event(struct notifier_block *this, - unsigned long event, - void *ptr) -{ - const struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net *net = dev_net(dev); - - if (event == NETDEV_DOWN) { - /* Device was downed. Search entire table for - conntracks which were associated with that device, - and forget them. */ - NF_CT_ASSERT(dev->ifindex != 0); - - nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); - } - - return NOTIFY_DONE; -} - -static int masq_inet_event(struct notifier_block *this, - unsigned long event, - void *ptr) -{ - struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; - struct netdev_notifier_info info; - - netdev_notifier_info_init(&info, dev); - return masq_device_event(this, event, &info); -} - -static struct notifier_block masq_dev_notifier = { - .notifier_call = masq_device_event, -}; - -static struct notifier_block masq_inet_notifier = { - .notifier_call = masq_inet_event, -}; - static struct xt_target masquerade_tg_reg __read_mostly = { .name = "MASQUERADE", .family = NFPROTO_IPV4, @@ -160,12 +75,8 @@ static int __init masquerade_tg_init(void) ret = xt_register_target(&masquerade_tg_reg); - if (ret == 0) { - /* Register for device down reports */ - register_netdevice_notifier(&masq_dev_notifier); - /* Register IP address change reports */ - register_inetaddr_notifier(&masq_inet_notifier); - } + if (ret == 0) + nf_nat_masquerade_ipv4_register_notifier(); return ret; } @@ -173,8 +84,7 @@ static int __init masquerade_tg_init(void) static void __exit masquerade_tg_exit(void) { xt_unregister_target(&masquerade_tg_reg); - unregister_netdevice_notifier(&masq_dev_notifier); - unregister_inetaddr_notifier(&masq_inet_notifier); + nf_nat_masquerade_ipv4_unregister_notifier(); } module_init(masquerade_tg_init); diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c new file mode 100644 index 000000000000..c6eb42100e9a --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -0,0 +1,153 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int +nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, + const struct nf_nat_range *range, + const struct net_device *out) +{ + struct nf_conn *ct; + struct nf_conn_nat *nat; + enum ip_conntrack_info ctinfo; + struct nf_nat_range newrange; + const struct rtable *rt; + __be32 newsrc, nh; + + NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); + + ct = nf_ct_get(skb, &ctinfo); + nat = nfct_nat(ct); + + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + /* Source address is 0.0.0.0 - locally generated packet that is + * probably not supposed to be masqueraded. + */ + if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) + return NF_ACCEPT; + + rt = skb_rtable(skb); + nh = rt_nexthop(rt, ip_hdr(skb)->daddr); + newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); + if (!newsrc) { + pr_info("%s ate my IP address\n", out->name); + return NF_DROP; + } + + nat->masq_index = out->ifindex; + + /* Transfer from original range. */ + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.ip = newsrc; + newrange.max_addr.ip = newsrc; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + /* Hand modified range to generic setup. */ + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); + +static int device_cmp(struct nf_conn *i, void *ifindex) +{ + const struct nf_conn_nat *nat = nfct_nat(i); + + if (!nat) + return 0; + if (nf_ct_l3num(i) != NFPROTO_IPV4) + return 0; + return nat->masq_index == (int)(long)ifindex; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + + if (event == NETDEV_DOWN) { + /* Device was downed. Search entire table for + * conntracks which were associated with that device, + * and forget them. + */ + NF_CT_ASSERT(dev->ifindex != 0); + + nf_ct_iterate_cleanup(net, device_cmp, + (void *)(long)dev->ifindex, 0, 0); + } + + return NOTIFY_DONE; +} + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return masq_device_event(this, event, &info); +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); + +void nf_nat_masquerade_ipv4_register_notifier(void) +{ + /* check if the notifier was already set */ + if (atomic_inc_return(&masquerade_notifier_refcount) > 1) + return; + + /* Register for device down reports */ + register_netdevice_notifier(&masq_dev_notifier); + /* Register IP address change reports */ + register_inetaddr_notifier(&masq_inet_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier); + +void nf_nat_masquerade_ipv4_unregister_notifier(void) +{ + /* check if the notifier still has clients */ + if (atomic_dec_return(&masquerade_notifier_refcount) > 0) + return; + + unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell "); -- cgit v1.2.3 From be6b635cd674add9410efa9ac6f03e0040848b12 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Thu, 4 Sep 2014 14:06:49 +0200 Subject: netfilter: nf_nat: generalize IPv6 masquerading support for nf_tables Let's refactor the code so we can reach the masquerade functionality from outside the xt context (ie. nftables). The patch includes the addition of an atomic counter to the masquerade notifier: the stuff to be done by the notifier is the same for xt and nftables. Therefore, only one notification handler is needed. This factorization only involves IPv6; a similar patch exists to handle IPv4. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv6/nf_nat_masquerade.h | 10 +++ net/ipv6/netfilter/Kconfig | 7 ++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/ip6t_MASQUERADE.c | 76 ++-------------- net/ipv6/netfilter/nf_nat_masquerade_ipv6.c | 120 +++++++++++++++++++++++++ 5 files changed, 143 insertions(+), 71 deletions(-) create mode 100644 include/net/netfilter/ipv6/nf_nat_masquerade.h create mode 100644 net/ipv6/netfilter/nf_nat_masquerade_ipv6.c (limited to 'include') diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h new file mode 100644 index 000000000000..0a13396cd390 --- /dev/null +++ b/include/net/netfilter/ipv6/nf_nat_masquerade.h @@ -0,0 +1,10 @@ +#ifndef _NF_NAT_MASQUERADE_IPV6_H_ +#define _NF_NAT_MASQUERADE_IPV6_H_ + +unsigned int +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, + const struct net_device *out); +void nf_nat_masquerade_ipv6_register_notifier(void); +void nf_nat_masquerade_ipv6_unregister_notifier(void); + +#endif /* _NF_NAT_MASQUERADE_IPV6_H_ */ diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index ac93df16f5af..6c8cfec6836a 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -246,8 +246,15 @@ config NF_NAT_IPV6 if NF_NAT_IPV6 +config NF_NAT_MASQUERADE_IPV6 + tristate "IPv6 masquerade support" + help + This is the kernel functionality to provide NAT in the masquerade + flavour (automatic source address selection) for IPv6. + config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" + select NF_NAT_MASQUERADE_IPV6 help Masquerading is a special case of NAT: all outgoing connections are changed to seem to come from a particular interface's address, and diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index c0b263104ed2..89a0bd751f82 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o +obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o # defrag nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 3e4e92d5e157..7f9f45d829d2 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -19,33 +19,12 @@ #include #include #include +#include static unsigned int masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; - enum ip_conntrack_info ctinfo; - struct in6_addr src; - struct nf_conn *ct; - struct nf_nat_range newrange; - - ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); - - if (ipv6_dev_get_saddr(dev_net(par->out), par->out, - &ipv6_hdr(skb)->daddr, 0, &src) < 0) - return NF_DROP; - - nfct_nat(ct)->masq_index = par->out->ifindex; - - newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.in6 = src; - newrange.max_addr.in6 = src; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; - - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); + return nf_nat_masquerade_ipv6(skb, par->targinfo, par->out); } static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) @@ -57,48 +36,6 @@ static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) return 0; } -static int device_cmp(struct nf_conn *ct, void *ifindex) -{ - const struct nf_conn_nat *nat = nfct_nat(ct); - - if (!nat) - return 0; - if (nf_ct_l3num(ct) != NFPROTO_IPV6) - return 0; - return nat->masq_index == (int)(long)ifindex; -} - -static int masq_device_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - const struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net *net = dev_net(dev); - - if (event == NETDEV_DOWN) - nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); - - return NOTIFY_DONE; -} - -static struct notifier_block masq_dev_notifier = { - .notifier_call = masq_device_event, -}; - -static int masq_inet_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct inet6_ifaddr *ifa = ptr; - struct netdev_notifier_info info; - - netdev_notifier_info_init(&info, ifa->idev->dev); - return masq_device_event(this, event, &info); -} - -static struct notifier_block masq_inet_notifier = { - .notifier_call = masq_inet_event, -}; - static struct xt_target masquerade_tg6_reg __read_mostly = { .name = "MASQUERADE", .family = NFPROTO_IPV6, @@ -115,17 +52,14 @@ static int __init masquerade_tg6_init(void) int err; err = xt_register_target(&masquerade_tg6_reg); - if (err == 0) { - register_netdevice_notifier(&masq_dev_notifier); - register_inet6addr_notifier(&masq_inet_notifier); - } + if (err == 0) + nf_nat_masquerade_ipv6_register_notifier(); return err; } static void __exit masquerade_tg6_exit(void) { - unregister_inet6addr_notifier(&masq_inet_notifier); - unregister_netdevice_notifier(&masq_dev_notifier); + nf_nat_masquerade_ipv6_unregister_notifier(); xt_unregister_target(&masquerade_tg6_reg); } diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c new file mode 100644 index 000000000000..7745609665cd --- /dev/null +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2011 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6 + * NAT funded by Astaro. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, + const struct net_device *out) +{ + enum ip_conntrack_info ctinfo; + struct in6_addr src; + struct nf_conn *ct; + struct nf_nat_range newrange; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + if (ipv6_dev_get_saddr(dev_net(out), out, + &ipv6_hdr(skb)->daddr, 0, &src) < 0) + return NF_DROP; + + nfct_nat(ct)->masq_index = out->ifindex; + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.in6 = src; + newrange.max_addr.in6 = src; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); + +static int device_cmp(struct nf_conn *ct, void *ifindex) +{ + const struct nf_conn_nat *nat = nfct_nat(ct); + + if (!nat) + return 0; + if (nf_ct_l3num(ct) != NFPROTO_IPV6) + return 0; + return nat->masq_index == (int)(long)ifindex; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + + if (event == NETDEV_DOWN) + nf_ct_iterate_cleanup(net, device_cmp, + (void *)(long)dev->ifindex, 0, 0); + + return NOTIFY_DONE; +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = ptr; + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, ifa->idev->dev); + return masq_device_event(this, event, &info); +} + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); + +void nf_nat_masquerade_ipv6_register_notifier(void) +{ + /* check if the notifier is already set */ + if (atomic_inc_return(&masquerade_notifier_refcount) > 1) + return; + + register_netdevice_notifier(&masq_dev_notifier); + register_inet6addr_notifier(&masq_inet_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier); + +void nf_nat_masquerade_ipv6_unregister_notifier(void) +{ + /* check if the notifier still has clients */ + if (atomic_dec_return(&masquerade_notifier_refcount) > 0) + return; + + unregister_inet6addr_notifier(&masq_inet_notifier); + unregister_netdevice_notifier(&masq_dev_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); -- cgit v1.2.3 From 9ba1f726bec090399eb9bb9157eb32dedc8e8c45 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Mon, 8 Sep 2014 13:45:00 +0200 Subject: netfilter: nf_tables: add new nft_masq expression The nft_masq expression is intended to perform NAT in the masquerade flavour. We decided to have the masquerade functionality in a separated expression other than nft_nat. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_masq.h | 16 ++++++ include/uapi/linux/netfilter/nf_tables.h | 11 ++++ net/ipv4/netfilter/Kconfig | 6 +++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nft_masq_ipv4.c | 89 ++++++++++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 6 +++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/nft_masq_ipv6.c | 89 ++++++++++++++++++++++++++++++++ net/netfilter/Kconfig | 9 ++++ net/netfilter/Makefile | 1 + net/netfilter/nft_masq.c | 59 +++++++++++++++++++++ 11 files changed, 288 insertions(+) create mode 100644 include/net/netfilter/nft_masq.h create mode 100644 net/ipv4/netfilter/nft_masq_ipv4.c create mode 100644 net/ipv6/netfilter/nft_masq_ipv6.c create mode 100644 net/netfilter/nft_masq.c (limited to 'include') diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h new file mode 100644 index 000000000000..c72729f954f4 --- /dev/null +++ b/include/net/netfilter/nft_masq.h @@ -0,0 +1,16 @@ +#ifndef _NFT_MASQ_H_ +#define _NFT_MASQ_H_ + +struct nft_masq { + u32 flags; +}; + +extern const struct nla_policy nft_masq_policy[]; + +int nft_masq_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]); + +int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr); + +#endif /* _NFT_MASQ_H_ */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 6022c6e6be18..eeec0ae845ef 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -800,4 +800,15 @@ enum nft_nat_attributes { }; #define NFTA_NAT_MAX (__NFTA_NAT_MAX - 1) +/** + * enum nft_masq_attributes - nf_tables masquerade expression attributes + * + * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) + */ +enum nft_masq_attributes { + NFTA_MASQ_FLAGS, + __NFTA_MASQ_MAX +}; +#define NFTA_MASQ_MAX (__NFTA_MASQ_MAX - 1) + #endif /* _LINUX_NF_TABLES_H */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4be3e541350e..8dd3d9f19d82 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -190,6 +190,12 @@ config NF_NAT_MASQUERADE_IPV4 This is the kernel functionality to provide NAT in the masquerade flavour (automatic source address selection). +config NFT_MASQ_IPV4 + tristate "IPv4 masquerading support for nf_tables" + depends on NF_TABLES_IPV4 + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV4 + config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NF_NAT_MASQUERADE_IPV4 diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 42056b2fd0e3..7d019aefb0ed 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o +obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c new file mode 100644 index 000000000000..6ea1d207b6a5 --- /dev/null +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_masq_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_masq *priv = nft_expr_priv(expr); + struct nf_nat_range range; + unsigned int verdict; + + range.flags = priv->flags; + + verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, + &range, pkt->out); + + data[NFT_REG_VERDICT].verdict = verdict; +} + +static int nft_masq_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + int err; + + err = nft_masq_init(ctx, expr, tb); + if (err < 0) + return err; + + nf_nat_masquerade_ipv4_register_notifier(); + return 0; +} + +static void nft_masq_ipv4_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nf_nat_masquerade_ipv4_unregister_notifier(); +} + +static struct nft_expr_type nft_masq_ipv4_type; +static const struct nft_expr_ops nft_masq_ipv4_ops = { + .type = &nft_masq_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), + .eval = nft_masq_ipv4_eval, + .init = nft_masq_ipv4_init, + .destroy = nft_masq_ipv4_destroy, + .dump = nft_masq_dump, +}; + +static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "masq", + .ops = &nft_masq_ipv4_ops, + .policy = nft_masq_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_masq_ipv4_module_init(void) +{ + return nft_register_expr(&nft_masq_ipv4_type); +} + +static void __exit nft_masq_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_masq_ipv4_type); +} + +module_init(nft_masq_ipv4_module_init); +module_exit(nft_masq_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6c8cfec6836a..24c535f66df0 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -252,6 +252,12 @@ config NF_NAT_MASQUERADE_IPV6 This is the kernel functionality to provide NAT in the masquerade flavour (automatic source address selection) for IPv6. +config NFT_MASQ_IPV6 + tristate "IPv6 masquerade support for nf_tables" + depends on NF_TABLES_IPV6 + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV6 + config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NF_NAT_MASQUERADE_IPV6 diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 89a0bd751f82..482c4dff273f 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o +obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c new file mode 100644 index 000000000000..4e51334ef6b7 --- /dev/null +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_masq_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_masq *priv = nft_expr_priv(expr); + struct nf_nat_range range; + unsigned int verdict; + + range.flags = priv->flags; + + verdict = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); + + data[NFT_REG_VERDICT].verdict = verdict; +} + +static int nft_masq_ipv6_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + int err; + + err = nft_masq_init(ctx, expr, tb); + if (err < 0) + return err; + + nf_nat_masquerade_ipv6_register_notifier(); + return 0; +} + +static void nft_masq_ipv6_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nf_nat_masquerade_ipv6_unregister_notifier(); +} + +static struct nft_expr_type nft_masq_ipv6_type; +static const struct nft_expr_ops nft_masq_ipv6_ops = { + .type = &nft_masq_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), + .eval = nft_masq_ipv6_eval, + .init = nft_masq_ipv6_init, + .destroy = nft_masq_ipv6_destroy, + .dump = nft_masq_dump, +}; + +static struct nft_expr_type nft_masq_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "masq", + .ops = &nft_masq_ipv6_ops, + .policy = nft_masq_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_masq_ipv6_module_init(void) +{ + return nft_register_expr(&nft_masq_ipv6_type); +} + +static void __exit nft_masq_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_masq_ipv6_type); +} + +module_init(nft_masq_ipv6_module_init); +module_exit(nft_masq_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ad751fe2e82b..37428723394f 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -496,6 +496,15 @@ config NFT_LIMIT This option adds the "limit" expression that you can use to ratelimit rule matchings. +config NFT_MASQ + depends on NF_TABLES + depends on NF_CONNTRACK + depends on NF_NAT + tristate "Netfilter nf_tables masquerade support" + help + This option adds the "masquerade" expression that you can use + to perform NAT in the masquerade flavour. + config NFT_NAT depends on NF_TABLES depends on NF_CONNTRACK diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 8308624a406a..0637792f6faf 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o +obj-$(CONFIG_NFT_MASQ) += nft_masq.o # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c new file mode 100644 index 000000000000..6637bab00567 --- /dev/null +++ b/net/netfilter/nft_masq.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { + [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, +}; +EXPORT_SYMBOL_GPL(nft_masq_policy); + +int nft_masq_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_masq *priv = nft_expr_priv(expr); + + if (tb[NFTA_MASQ_FLAGS] == NULL) + return 0; + + priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_masq_init); + +int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_masq *priv = nft_expr_priv(expr); + + if (priv->flags == 0) + return 0; + + if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_masq_dump); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); -- cgit v1.2.3 From 02ab695bb37ee9ad515df0d0790d5977505dd04a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Sep 2014 22:17:17 -0700 Subject: net: filter: add "load 64-bit immediate" eBPF instruction add BPF_LD_IMM64 instruction to load 64-bit immediate value into a register. All previous instructions were 8-byte. This is first 16-byte instruction. Two consecutive 'struct bpf_insn' blocks are interpreted as single instruction: insn[0].code = BPF_LD | BPF_DW | BPF_IMM insn[0].dst_reg = destination register insn[0].imm = lower 32-bit insn[1].code = 0 insn[1].imm = upper 32-bit All unused fields must be zero. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. x64 JITs it as single 'movabsq %rax, imm64' arm64 may JIT as sequence of four 'movk x0, #imm16, lsl #shift' insn Note that old eBPF programs are binary compatible with new interpreter. It helps eBPF programs load 64-bit constant into a register with one instruction instead of using two registers and 4 instructions: BPF_MOV32_IMM(R1, imm32) BPF_ALU64_IMM(BPF_LSH, R1, 32) BPF_MOV32_IMM(R2, imm32) BPF_ALU64_REG(BPF_OR, R1, R2) User space generated programs will use this instruction to load constants only. To tell kernel that user space needs a pointer the _pseudo_ variant of this instruction may be added later, which will use extra bits of encoding to indicate what type of pointer user space is asking kernel to provide. For example 'off' or 'src_reg' fields can be used for such purpose. src_reg = 1 could mean that user space is asking kernel to validate and load in-kernel map pointer. src_reg = 2 could mean that user space needs readonly data section pointer src_reg = 3 could mean that user space needs a pointer to per-cpu local data All such future pseudo instructions will not be carrying the actual pointer as part of the instruction, but rather will be treated as a request to kernel to provide one. The kernel will verify the request_for_a_pointer, then will drop _pseudo_ marking and will store actual internal pointer inside the instruction, so the end result is the interpreter and JITs never see pseudo BPF_LD_IMM64 insns and only operate on generic BPF_LD_IMM64 that loads 64-bit immediate into a register. User space never operates on direct pointers and verifier can easily recognize request_for_pointer vs other instructions. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 8 +++++++- arch/x86/net/bpf_jit_comp.c | 17 +++++++++++++++++ include/linux/filter.h | 18 ++++++++++++++++++ kernel/bpf/core.c | 5 +++++ lib/test_bpf.c | 21 +++++++++++++++++++++ 5 files changed, 68 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index c48a9704bda8..81916ab5d96f 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -951,7 +951,7 @@ Size modifier is one of ... Mode modifier is one of: - BPF_IMM 0x00 /* classic BPF only, reserved in eBPF */ + BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ BPF_ABS 0x20 BPF_IND 0x40 BPF_MEM 0x60 @@ -995,6 +995,12 @@ BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and 2 byte atomic increments are not supported. +eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists +of two consecutive 'struct bpf_insn' 8-byte blocks and interpreted as single +instruction that loads 64-bit immediate value into a dst_reg. +Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads +32-bit immediate value into a register. + Testing ------- diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 39ccfbb4a723..06f8c17f5484 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -393,6 +393,23 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); break; + case BPF_LD | BPF_IMM | BPF_DW: + if (insn[1].code != 0 || insn[1].src_reg != 0 || + insn[1].dst_reg != 0 || insn[1].off != 0) { + /* verifier must catch invalid insns */ + pr_err("invalid BPF_LD_IMM64 insn\n"); + return -EINVAL; + } + + /* movabsq %rax, imm64 */ + EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); + EMIT(insn[0].imm, 4); + EMIT(insn[1].imm, 4); + + insn++; + i++; + break; + /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */ case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_DIV | BPF_X: diff --git a/include/linux/filter.h b/include/linux/filter.h index c78994593355..bf323da77950 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -166,6 +166,24 @@ enum { .off = 0, \ .imm = IMM }) +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b54bb2c2e494..2c2bfaacce66 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -242,6 +242,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; void *ptr; int off; @@ -301,6 +302,10 @@ select_insn: ALU64_MOV_K: DST = IMM; CONT; + LD_IMM_DW: + DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; + insn++; + CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; CONT; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 9a67456ba29a..413890815d3e 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1735,6 +1735,27 @@ static struct bpf_test tests[] = { { }, { { 1, 0 } }, }, + { + "load 64-bit immediate", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x567800001234L), + BPF_MOV64_REG(R2, R1), + BPF_MOV64_REG(R3, R2), + BPF_ALU64_IMM(BPF_RSH, R2, 32), + BPF_ALU64_IMM(BPF_LSH, R3, 32), + BPF_ALU64_IMM(BPF_RSH, R3, 32), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_JMP_IMM(BPF_JEQ, R2, 0x5678, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JEQ, R3, 0x1234, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } } + }, }; static struct net_device dev; -- cgit v1.2.3 From daedfb22451dd02b35c0549566cbb7cc06bdd53b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Sep 2014 22:17:18 -0700 Subject: net: filter: split filter.h and expose eBPF to user space allow user space to generate eBPF programs uapi/linux/bpf.h: eBPF instruction set definition linux/filter.h: the rest This patch only moves macro definitions, but practically it freezes existing eBPF instruction set, though new instructions can still be added in the future. These eBPF definitions cannot go into uapi/linux/filter.h, since the names may conflict with existing applications. Full eBPF ISA description is in Documentation/networking/filter.txt Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 56 +--------------------------------------- include/uapi/linux/Kbuild | 1 + include/uapi/linux/bpf.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 55 deletions(-) create mode 100644 include/uapi/linux/bpf.h (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index bf323da77950..8f82ef3f1cdd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -10,58 +10,12 @@ #include #include #include +#include struct sk_buff; struct sock; struct seccomp_data; -/* Internally used and optimized filter representation with extended - * instruction set based on top of classic BPF. - */ - -/* instruction classes */ -#define BPF_ALU64 0x07 /* alu mode in double word width */ - -/* ld/ldx fields */ -#define BPF_DW 0x18 /* double word */ -#define BPF_XADD 0xc0 /* exclusive add */ - -/* alu/jmp fields */ -#define BPF_MOV 0xb0 /* mov reg to reg */ -#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ - -/* change endianness of a register */ -#define BPF_END 0xd0 /* flags for endianness conversion: */ -#define BPF_TO_LE 0x00 /* convert to little-endian */ -#define BPF_TO_BE 0x08 /* convert to big-endian */ -#define BPF_FROM_LE BPF_TO_LE -#define BPF_FROM_BE BPF_TO_BE - -#define BPF_JNE 0x50 /* jump != */ -#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ -#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ -#define BPF_CALL 0x80 /* function call */ -#define BPF_EXIT 0x90 /* function return */ - -/* Register numbers */ -enum { - BPF_REG_0 = 0, - BPF_REG_1, - BPF_REG_2, - BPF_REG_3, - BPF_REG_4, - BPF_REG_5, - BPF_REG_6, - BPF_REG_7, - BPF_REG_8, - BPF_REG_9, - BPF_REG_10, - __MAX_BPF_REG, -}; - -/* BPF has 10 general purpose 64-bit registers and stack frame. */ -#define MAX_BPF_REG __MAX_BPF_REG - /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function * calls in BPF_CALL instruction. @@ -322,14 +276,6 @@ enum { #define SK_RUN_FILTER(filter, ctx) \ (*filter->prog->bpf_func)(ctx, filter->prog->insnsi) -struct bpf_insn { - __u8 code; /* opcode */ - __u8 dst_reg:4; /* dest register */ - __u8 src_reg:4; /* source register */ - __s16 off; /* signed offset */ - __s32 imm; /* signed immediate constant */ -}; - #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 24e9033f8b3f..fb3f7b675229 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -67,6 +67,7 @@ header-y += bfs_fs.h header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h +header-y += bpf.h header-y += bpqether.h header-y += bsg.h header-y += btrfs.h diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h new file mode 100644 index 000000000000..479ed0b6be16 --- /dev/null +++ b/include/uapi/linux/bpf.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _UAPI__LINUX_BPF_H__ +#define _UAPI__LINUX_BPF_H__ + +#include + +/* Extended instruction set based on top of classic BPF */ + +/* instruction classes */ +#define BPF_ALU64 0x07 /* alu mode in double word width */ + +/* ld/ldx fields */ +#define BPF_DW 0x18 /* double word */ +#define BPF_XADD 0xc0 /* exclusive add */ + +/* alu/jmp fields */ +#define BPF_MOV 0xb0 /* mov reg to reg */ +#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ + +/* change endianness of a register */ +#define BPF_END 0xd0 /* flags for endianness conversion: */ +#define BPF_TO_LE 0x00 /* convert to little-endian */ +#define BPF_TO_BE 0x08 /* convert to big-endian */ +#define BPF_FROM_LE BPF_TO_LE +#define BPF_FROM_BE BPF_TO_BE + +#define BPF_JNE 0x50 /* jump != */ +#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ +#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_CALL 0x80 /* function call */ +#define BPF_EXIT 0x90 /* function return */ + +/* Register numbers */ +enum { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + __MAX_BPF_REG, +}; + +/* BPF has 10 general purpose 64-bit registers and stack frame. */ +#define MAX_BPF_REG __MAX_BPF_REG + +struct bpf_insn { + __u8 code; /* opcode */ + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ +}; + +#endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 49a601589caaf0e93194c0cc9b4ecddbe75dd2d5 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Fri, 5 Sep 2014 15:09:03 +0200 Subject: net/ipv4: bind ip_nonlocal_bind to current netns net.ipv4.ip_nonlocal_bind sysctl was global to all network namespaces. This patch allows to set a different value for each network namespace. Signed-off-by: Vincent Bernat Signed-off-by: David S. Miller --- include/net/ip.h | 2 -- include/net/netns/ipv4.h | 1 + net/ipv4/af_inet.c | 6 +----- net/ipv4/ping.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv6/af_inet6.c | 2 +- net/sctp/protocol.c | 2 +- 7 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index c8fd6112bd0b..14bfc8e1bcf9 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -229,8 +229,6 @@ static inline int inet_is_local_reserved_port(struct net *net, int port) } #endif -extern int sysctl_ip_nonlocal_bind; - /* From inetpeer.c */ extern int inet_peer_threshold; extern int inet_peer_minttl; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index aec5e12f9f19..24945cefc4fd 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -76,6 +76,7 @@ struct netns_ipv4 { int sysctl_tcp_ecn; int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; + int sysctl_ip_nonlocal_bind; int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d156b3c5f363..b537bd94906c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -418,10 +418,6 @@ int inet_release(struct socket *sock) } EXPORT_SYMBOL(inet_release); -/* It is off by default, see below. */ -int sysctl_ip_nonlocal_bind __read_mostly; -EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); - int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; @@ -461,7 +457,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * is temporarily down) */ err = -EADDRNOTAVAIL; - if (!sysctl_ip_nonlocal_bind && + if (!net->ipv4.sysctl_ip_nonlocal_bind && !(inet->freebind || inet->transparent) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index a3c59a077a5f..57f7c9804139 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -311,7 +311,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; - if ((sysctl_ip_nonlocal_bind == 0 && + if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && isk->freebind == 0 && isk->transparent == 0 && chk_addr_ret != RTN_LOCAL) || chk_addr_ret == RTN_MULTICAST || diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 45d156dacd61..1599966f4639 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -285,13 +285,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &ip_ttl_min, .extra2 = &ip_ttl_max, }, - { - .procname = "ip_nonlocal_bind", - .data = &sysctl_ip_nonlocal_bind, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_syn_retries", .data = &sysctl_tcp_syn_retries, @@ -848,6 +841,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ip_nonlocal_bind", + .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "fwmark_reflect", .data = &init_net.ipv4.sysctl_fwmark_reflect, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b9393e6a21fe..e4865a3ebe1d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -302,7 +302,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type(net, v4addr); - if (!sysctl_ip_nonlocal_bind && + if (!net->ipv4.sysctl_ip_nonlocal_bind && !(inet->freebind || inet->transparent) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6240834f4b95..9d2c6c9facb6 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -366,7 +366,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && !sp->inet.freebind && - !sysctl_ip_nonlocal_bind) + !net->ipv4.sysctl_ip_nonlocal_bind) return 0; if (ipv6_only_sock(sctp_opt2sk(sp))) -- cgit v1.2.3 From e5c3ea5c668033b303e7ac835d7d91da32d97958 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 5 Sep 2014 15:51:31 +0200 Subject: bridge: implement rtnl_link_ops->get_size and rtnl_link_ops->fill_info Allow rtnetlink users to get bridge master info in IFLA_INFO_DATA attr This initial part implements forward_delay, hello_time, max_age options. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 12 ++++++++++++ net/bridge/br_netlink.c | 25 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ff957604a721..c80f95f6ee78 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -215,6 +215,18 @@ enum in6_addr_gen_mode { IN6_ADDR_GEN_MODE_NONE, }; +/* Bridge section */ + +enum { + IFLA_BR_UNSPEC, + IFLA_BR_FORWARD_DELAY, + IFLA_BR_HELLO_TIME, + IFLA_BR_MAX_AGE, + __IFLA_BR_MAX, +}; + +#define IFLA_BR_MAX (__IFLA_BR_MAX - 1) + enum { BRIDGE_MODE_UNSPEC, BRIDGE_MODE_HAIRPIN, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 05aeea1222a7..bcac09fe2ac8 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -484,6 +484,29 @@ static size_t br_port_get_slave_size(const struct net_device *brdev, return br_port_info_size(); } +static size_t br_get_size(const struct net_device *brdev) +{ + return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */ + 0; +} + +static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) +{ + struct net_bridge *br = netdev_priv(brdev); + u32 forward_delay = jiffies_to_clock_t(br->forward_delay); + u32 hello_time = jiffies_to_clock_t(br->hello_time); + u32 age_time = jiffies_to_clock_t(br->max_age); + + if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || + nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || + nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time)) + return -EMSGSIZE; + + return 0; +} + static size_t br_get_link_af_size(const struct net_device *dev) { struct net_port_vlans *pv; @@ -514,6 +537,8 @@ struct rtnl_link_ops br_link_ops __read_mostly = { .validate = br_validate, .newlink = br_dev_newlink, .dellink = br_dev_delete, + .get_size = br_get_size, + .fill_info = br_fill_info, .slave_maxtype = IFLA_BRPORT_MAX, .slave_policy = br_port_policy, -- cgit v1.2.3 From dc8ecdd3a3fccf73fcb07711cde064ce5727f9d1 Mon Sep 17 00:00:00 2001 From: RafaÅ‚ MiÅ‚ecki Date: Mon, 1 Sep 2014 23:11:06 +0200 Subject: bcma: move bus struct setup into early part of host specific code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change is important for SoC host. In future we will want to know chip ID (needed for early MIPS boot) before doing cores scanning. Signed-off-by: RafaÅ‚ MiÅ‚ecki Acked-by: Hauke Mehrtens Signed-off-by: John W. Linville --- drivers/bcma/host_pci.c | 3 +++ drivers/bcma/host_soc.c | 3 +++ drivers/bcma/main.c | 2 -- drivers/bcma/scan.c | 7 ------- include/linux/bcma/bcma.h | 1 - 5 files changed, 6 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c index f032ed6dd459..1e5ac0a79696 100644 --- a/drivers/bcma/host_pci.c +++ b/drivers/bcma/host_pci.c @@ -208,6 +208,9 @@ static int bcma_host_pci_probe(struct pci_dev *dev, bus->boardinfo.vendor = bus->host_pci->subsystem_vendor; bus->boardinfo.type = bus->host_pci->subsystem_device; + /* Initialize struct, detect chip */ + bcma_init_bus(bus); + /* Register */ err = bcma_bus_register(bus); if (err) diff --git a/drivers/bcma/host_soc.c b/drivers/bcma/host_soc.c index 1edd7e064621..379e0d4ebe72 100644 --- a/drivers/bcma/host_soc.c +++ b/drivers/bcma/host_soc.c @@ -178,6 +178,9 @@ int __init bcma_host_soc_register(struct bcma_soc *soc) bus->hosttype = BCMA_HOSTTYPE_SOC; bus->ops = &bcma_host_soc_ops; + /* Initialize struct, detect chip */ + bcma_init_bus(bus); + /* Register */ err = bcma_bus_early_register(bus, &soc->core_cc, &soc->core_mips); if (err) diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index 0ff8d58831ef..9f6b0cb9a12c 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -334,8 +334,6 @@ int __init bcma_bus_early_register(struct bcma_bus *bus, struct bcma_device *core; struct bcma_device_id match; - bcma_init_bus(bus); - match.manuf = BCMA_MANUF_BCM; match.id = bcma_cc_core_id(bus); match.class = BCMA_CL_SIM; diff --git a/drivers/bcma/scan.c b/drivers/bcma/scan.c index e9bd77249a4c..e2b990303042 100644 --- a/drivers/bcma/scan.c +++ b/drivers/bcma/scan.c @@ -438,9 +438,6 @@ void bcma_init_bus(struct bcma_bus *bus) s32 tmp; struct bcma_chipinfo *chipinfo = &(bus->chipinfo); - if (bus->init_done) - return; - INIT_LIST_HEAD(&bus->cores); bus->nr_cores = 0; @@ -452,8 +449,6 @@ void bcma_init_bus(struct bcma_bus *bus) chipinfo->pkg = (tmp & BCMA_CC_ID_PKG) >> BCMA_CC_ID_PKG_SHIFT; bcma_info(bus, "Found chip with id 0x%04X, rev 0x%02X and package 0x%02X\n", chipinfo->id, chipinfo->rev, chipinfo->pkg); - - bus->init_done = true; } int bcma_bus_scan(struct bcma_bus *bus) @@ -463,8 +458,6 @@ int bcma_bus_scan(struct bcma_bus *bus) int err, core_num = 0; - bcma_init_bus(bus); - erombase = bcma_scan_read32(bus, 0, BCMA_CC_EROM); if (bus->hosttype == BCMA_HOSTTYPE_SOC) { eromptr = ioremap_nocache(erombase, BCMA_CORE_SIZE); diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 0272e49135d0..c1ba87d1548e 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -332,7 +332,6 @@ struct bcma_bus { struct bcma_device *mapped_core; struct list_head cores; u8 nr_cores; - u8 init_done:1; u8 num; struct bcma_drv_cc drv_cc; -- cgit v1.2.3 From a395135ddebb0a06052b84c309eb6cb68b79c797 Mon Sep 17 00:00:00 2001 From: RafaÅ‚ MiÅ‚ecki Date: Mon, 1 Sep 2014 23:11:07 +0200 Subject: bcma: use separated function to initialize bus on SoC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is required to split SoC bus init into two phases. The later one (which includes scanning) should be called when kalloc is available. Cc: Ralf Baechle Signed-off-by: RafaÅ‚ MiÅ‚ecki Acked-by: Hauke Mehrtens Signed-off-by: John W. Linville --- arch/mips/bcm47xx/setup.c | 4 ++++ drivers/bcma/host_soc.c | 11 +++++++++-- include/linux/bcma/bcma_soc.h | 1 + 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index 2b63e7e7d3d3..fff6ed4978c7 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -201,6 +201,10 @@ static void __init bcm47xx_register_bcma(void) pr_warn("bcm47xx: someone else already registered a bcma SPROM callback handler (err %d)\n", err); err = bcma_host_soc_register(&bcm47xx_bus.bcma); + if (err) + panic("Failed to register BCMA bus (err %d)", err); + + err = bcma_host_soc_init(&bcm47xx_bus.bcma); if (err) panic("Failed to initialize BCMA bus (err %d)", err); diff --git a/drivers/bcma/host_soc.c b/drivers/bcma/host_soc.c index 379e0d4ebe72..718e054dd727 100644 --- a/drivers/bcma/host_soc.c +++ b/drivers/bcma/host_soc.c @@ -165,7 +165,6 @@ static const struct bcma_host_ops bcma_host_soc_ops = { int __init bcma_host_soc_register(struct bcma_soc *soc) { struct bcma_bus *bus = &soc->bus; - int err; /* iomap only first core. We have to read some register on this core * to scan the bus. @@ -181,7 +180,15 @@ int __init bcma_host_soc_register(struct bcma_soc *soc) /* Initialize struct, detect chip */ bcma_init_bus(bus); - /* Register */ + return 0; +} + +int __init bcma_host_soc_init(struct bcma_soc *soc) +{ + struct bcma_bus *bus = &soc->bus; + int err; + + /* Scan bus and initialize it */ err = bcma_bus_early_register(bus, &soc->core_cc, &soc->core_mips); if (err) iounmap(bus->mmio); diff --git a/include/linux/bcma/bcma_soc.h b/include/linux/bcma/bcma_soc.h index 4203c5593b9f..f24d245f8394 100644 --- a/include/linux/bcma/bcma_soc.h +++ b/include/linux/bcma/bcma_soc.h @@ -10,6 +10,7 @@ struct bcma_soc { }; int __init bcma_host_soc_register(struct bcma_soc *soc); +int __init bcma_host_soc_init(struct bcma_soc *soc); int bcma_bus_register(struct bcma_bus *bus); -- cgit v1.2.3 From 23a2f39c8f4035eade7f226eb7ada30c78d9eee3 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Mon, 8 Sep 2014 22:53:35 +0200 Subject: bcma: store more alternative addresses Each core could have more than one alternative address. There are cores with 8 alternative addresses for different functions. The PHY control in the Chip common B core is done through the 2. alternative address and not the first one. Signed-off-by: Hauke Mehrtens CC: linux-usb@vger.kernel.org Signed-off-by: John W. Linville --- drivers/bcma/scan.c | 9 +++++---- drivers/usb/host/bcma-hcd.c | 2 +- include/linux/bcma/bcma.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/bcma/scan.c b/drivers/bcma/scan.c index e2b990303042..06be7282cbc4 100644 --- a/drivers/bcma/scan.c +++ b/drivers/bcma/scan.c @@ -276,7 +276,7 @@ static int bcma_get_next_core(struct bcma_bus *bus, u32 __iomem **eromptr, struct bcma_device *core) { u32 tmp; - u8 i, j; + u8 i, j, k; s32 cia, cib; u8 ports[2], wrappers[2]; @@ -367,6 +367,7 @@ static int bcma_get_next_core(struct bcma_bus *bus, u32 __iomem **eromptr, core->addr = tmp; /* get & parse slave ports */ + k = 0; for (i = 0; i < ports[1]; i++) { for (j = 0; ; j++) { tmp = bcma_erom_get_addr_desc(bus, eromptr, @@ -376,9 +377,9 @@ static int bcma_get_next_core(struct bcma_bus *bus, u32 __iomem **eromptr, /* pr_debug("erom: slave port %d " * "has %d descriptors\n", i, j); */ break; - } else { - if (i == 0 && j == 0) - core->addr1 = tmp; + } else if (k < ARRAY_SIZE(core->addr_s)) { + core->addr_s[k] = tmp; + k++; } } } diff --git a/drivers/usb/host/bcma-hcd.c b/drivers/usb/host/bcma-hcd.c index 205f4a336583..cd6d0afb6b8f 100644 --- a/drivers/usb/host/bcma-hcd.c +++ b/drivers/usb/host/bcma-hcd.c @@ -237,7 +237,7 @@ static int bcma_hcd_probe(struct bcma_device *dev) bcma_hcd_init_chip(dev); /* In AI chips EHCI is addrspace 0, OHCI is 1 */ - ohci_addr = dev->addr1; + ohci_addr = dev->addr_s[0]; if ((chipinfo->id == 0x5357 || chipinfo->id == 0x4749) && chipinfo->rev == 0) ohci_addr = 0x18009000; diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index c1ba87d1548e..7fc16c991291 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -267,7 +267,7 @@ struct bcma_device { u8 core_unit; u32 addr; - u32 addr1; + u32 addr_s[8]; u32 wrap; void __iomem *io_addr; -- cgit v1.2.3 From 1716bcf3f76fe71e98d4851a3eb73ea3d93d4773 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Mon, 8 Sep 2014 22:53:36 +0200 Subject: bcma: add support for chipcommon B core This core is used on BCM4708 to configure the PCIe and USB3 PHYs and it contains the addresses to the Device Management unit. This will be used by the PCIe driver first. Signed-off-by: Hauke Mehrtens Signed-off-by: John W. Linville --- drivers/bcma/Makefile | 1 + drivers/bcma/bcma_private.h | 4 ++ drivers/bcma/driver_chipcommon_b.c | 61 +++++++++++++++++++++++++++++ drivers/bcma/main.c | 10 +++++ drivers/bcma/scan.c | 1 + include/linux/bcma/bcma.h | 1 + include/linux/bcma/bcma_driver_chipcommon.h | 8 ++++ 7 files changed, 86 insertions(+) create mode 100644 drivers/bcma/driver_chipcommon_b.c (limited to 'include') diff --git a/drivers/bcma/Makefile b/drivers/bcma/Makefile index 91290f7f61b8..838b4b9d352f 100644 --- a/drivers/bcma/Makefile +++ b/drivers/bcma/Makefile @@ -1,5 +1,6 @@ bcma-y += main.o scan.o core.o sprom.o bcma-y += driver_chipcommon.o driver_chipcommon_pmu.o +bcma-y += driver_chipcommon_b.o bcma-$(CONFIG_BCMA_SFLASH) += driver_chipcommon_sflash.o bcma-$(CONFIG_BCMA_NFLASH) += driver_chipcommon_nflash.o bcma-y += driver_pci.o diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h index 09b632ad0fe2..b40be43c6f31 100644 --- a/drivers/bcma/bcma_private.h +++ b/drivers/bcma/bcma_private.h @@ -50,6 +50,10 @@ void bcma_chipco_serial_init(struct bcma_drv_cc *cc); extern struct platform_device bcma_pflash_dev; #endif /* CONFIG_BCMA_DRIVER_MIPS */ +/* driver_chipcommon_b.c */ +int bcma_core_chipcommon_b_init(struct bcma_drv_cc_b *ccb); +void bcma_core_chipcommon_b_free(struct bcma_drv_cc_b *ccb); + /* driver_chipcommon_pmu.c */ u32 bcma_pmu_get_alp_clock(struct bcma_drv_cc *cc); u32 bcma_pmu_get_cpu_clock(struct bcma_drv_cc *cc); diff --git a/drivers/bcma/driver_chipcommon_b.c b/drivers/bcma/driver_chipcommon_b.c new file mode 100644 index 000000000000..c20b5f4ff290 --- /dev/null +++ b/drivers/bcma/driver_chipcommon_b.c @@ -0,0 +1,61 @@ +/* + * Broadcom specific AMBA + * ChipCommon B Unit driver + * + * Copyright 2014, Hauke Mehrtens + * + * Licensed under the GNU/GPL. See COPYING for details. + */ + +#include "bcma_private.h" +#include +#include + +static bool bcma_wait_reg(struct bcma_bus *bus, void __iomem *addr, u32 mask, + u32 value, int timeout) +{ + unsigned long deadline = jiffies + timeout; + u32 val; + + do { + val = readl(addr); + if ((val & mask) == value) + return true; + cpu_relax(); + udelay(10); + } while (!time_after_eq(jiffies, deadline)); + + bcma_err(bus, "Timeout waiting for register %p\n", addr); + + return false; +} + +void bcma_chipco_b_mii_write(struct bcma_drv_cc_b *ccb, u32 offset, u32 value) +{ + struct bcma_bus *bus = ccb->core->bus; + + writel(offset, ccb->mii + 0x00); + bcma_wait_reg(bus, ccb->mii + 0x00, 0x0100, 0x0000, 100); + writel(value, ccb->mii + 0x04); + bcma_wait_reg(bus, ccb->mii + 0x00, 0x0100, 0x0000, 100); +} +EXPORT_SYMBOL_GPL(bcma_chipco_b_mii_write); + +int bcma_core_chipcommon_b_init(struct bcma_drv_cc_b *ccb) +{ + if (ccb->setup_done) + return 0; + + ccb->setup_done = 1; + ccb->mii = ioremap_nocache(ccb->core->addr_s[1], BCMA_CORE_SIZE); + if (!ccb->mii) + return -ENOMEM; + + return 0; +} + +void bcma_core_chipcommon_b_free(struct bcma_drv_cc_b *ccb) +{ + if (ccb->mii) + iounmap(ccb->mii); +} diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index 297a2d46985a..c421403cab43 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -173,6 +173,7 @@ static int bcma_register_devices(struct bcma_bus *bus) switch (core->id.id) { case BCMA_CORE_4706_CHIPCOMMON: case BCMA_CORE_CHIPCOMMON: + case BCMA_CORE_NS_CHIPCOMMON_B: case BCMA_CORE_PCI: case BCMA_CORE_PCIE: case BCMA_CORE_PCIE2: @@ -287,6 +288,13 @@ int bcma_bus_register(struct bcma_bus *bus) bcma_core_chipcommon_init(&bus->drv_cc); } + /* Init CC core */ + core = bcma_find_core(bus, BCMA_CORE_NS_CHIPCOMMON_B); + if (core) { + bus->drv_cc_b.core = core; + bcma_core_chipcommon_b_init(&bus->drv_cc_b); + } + /* Init MIPS core */ core = bcma_find_core(bus, BCMA_CORE_MIPS_74K); if (core) { @@ -341,6 +349,8 @@ void bcma_bus_unregister(struct bcma_bus *bus) else if (err) bcma_err(bus, "Can not unregister GPIO driver: %i\n", err); + bcma_core_chipcommon_b_free(&bus->drv_cc_b); + cores[0] = bcma_find_core(bus, BCMA_CORE_MIPS_74K); cores[1] = bcma_find_core(bus, BCMA_CORE_PCIE); cores[2] = bcma_find_core(bus, BCMA_CORE_4706_MAC_GBIT_COMMON); diff --git a/drivers/bcma/scan.c b/drivers/bcma/scan.c index 06be7282cbc4..b3a403c136fb 100644 --- a/drivers/bcma/scan.c +++ b/drivers/bcma/scan.c @@ -314,6 +314,7 @@ static int bcma_get_next_core(struct bcma_bus *bus, u32 __iomem **eromptr, /* Some specific cores don't need wrappers */ switch (core->id.id) { case BCMA_CORE_4706_MAC_GBIT_COMMON: + case BCMA_CORE_NS_CHIPCOMMON_B: /* Not used yet: case BCMA_CORE_OOB_ROUTER: */ break; default: diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 7fc16c991291..634597917670 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -335,6 +335,7 @@ struct bcma_bus { u8 num; struct bcma_drv_cc drv_cc; + struct bcma_drv_cc_b drv_cc_b; struct bcma_drv_pci drv_pci[2]; struct bcma_drv_pcie2 drv_pcie2; struct bcma_drv_mips drv_mips; diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 63d105cd14a3..db6fa217f98b 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -644,6 +644,12 @@ struct bcma_drv_cc { #endif }; +struct bcma_drv_cc_b { + struct bcma_device *core; + u8 setup_done:1; + void __iomem *mii; +}; + /* Register access */ #define bcma_cc_read32(cc, offset) \ bcma_read32((cc)->core, offset) @@ -699,4 +705,6 @@ extern void bcma_pmu_spuravoid_pllupdate(struct bcma_drv_cc *cc, int spuravoid); extern u32 bcma_pmu_get_bus_clock(struct bcma_drv_cc *cc); +void bcma_chipco_b_mii_write(struct bcma_drv_cc_b *ccb, u32 offset, u32 value); + #endif /* LINUX_BCMA_DRIVER_CC_H_ */ -- cgit v1.2.3 From 738cbe72adc5c8f2016c4c68aa5162631d4f27e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 Sep 2014 08:04:47 +0200 Subject: net: bpf: consolidate JIT binary allocator Introduced in commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks") and later on replicated in aa2d2c73c21f ("s390/bpf,jit: address randomize and write protect jit code") for s390 architecture, write protection for BPF JIT images got added and a random start address of the JIT code, so that it's not on a page boundary anymore. Since both use a very similar allocator for the BPF binary header, we can consolidate this code into the BPF core as it's mostly JIT independant anyway. This will also allow for future archs that support DEBUG_SET_MODULE_RONX to just reuse instead of reimplementing it. JIT tested on x86_64 and s390x with BPF test suite. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Eric Dumazet Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: David S. Miller --- arch/s390/net/bpf_jit_comp.c | 45 ++++++++------------------------------- arch/x86/net/bpf_jit_comp.c | 50 ++++++++++---------------------------------- include/linux/filter.h | 13 ++++++++++++ kernel/bpf/core.c | 39 ++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index f2833c5b218a..b734f975c22e 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -5,11 +5,9 @@ * * Author(s): Martin Schwidefsky */ -#include #include #include #include -#include #include #include #include @@ -148,6 +146,12 @@ struct bpf_jit { ret; \ }) +static void bpf_jit_fill_hole(void *area, unsigned int size) +{ + /* Fill whole space with illegal instructions */ + memset(area, 0, size); +} + static void bpf_jit_prologue(struct bpf_jit *jit) { /* Save registers and create stack frame if necessary */ @@ -780,38 +784,6 @@ out: return -1; } -/* - * Note: for security reasons, bpf code will follow a randomly - * sized amount of illegal instructions. - */ -struct bpf_binary_header { - unsigned int pages; - u8 image[]; -}; - -static struct bpf_binary_header *bpf_alloc_binary(unsigned int bpfsize, - u8 **image_ptr) -{ - struct bpf_binary_header *header; - unsigned int sz, hole; - - /* Most BPF filters are really small, but if some of them fill a page, - * allow at least 128 extra bytes for illegal instructions. - */ - sz = round_up(bpfsize + sizeof(*header) + 128, PAGE_SIZE); - header = module_alloc(sz); - if (!header) - return NULL; - memset(header, 0, sz); - header->pages = sz / PAGE_SIZE; - hole = min(sz - (bpfsize + sizeof(*header)), PAGE_SIZE - sizeof(*header)); - /* Insert random number of illegal instructions before BPF code - * and make sure the first instruction starts at an even address. - */ - *image_ptr = &header->image[(prandom_u32() % hole) & -2]; - return header; -} - void bpf_jit_compile(struct bpf_prog *fp) { struct bpf_binary_header *header = NULL; @@ -850,7 +822,8 @@ void bpf_jit_compile(struct bpf_prog *fp) size = prg_len + lit_len; if (size >= BPF_SIZE_MAX) goto out; - header = bpf_alloc_binary(size, &jit.start); + header = bpf_jit_binary_alloc(size, &jit.start, + 2, bpf_jit_fill_hole); if (!header) goto out; jit.prg = jit.mid = jit.start + prg_len; @@ -884,7 +857,7 @@ void bpf_jit_free(struct bpf_prog *fp) goto free_filter; set_memory_rw(addr, header->pages); - module_free(NULL, header); + bpf_jit_binary_free(header); free_filter: bpf_prog_unlock_free(fp); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 06f8c17f5484..9de0b5476b0c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,12 +8,10 @@ * as published by the Free Software Foundation; version 2 * of the License. */ -#include -#include #include #include #include -#include +#include int bpf_jit_enable __read_mostly; @@ -109,39 +107,6 @@ static inline void bpf_flush_icache(void *start, void *end) #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) -struct bpf_binary_header { - unsigned int pages; - /* Note : for security reasons, bpf code will follow a randomly - * sized amount of int3 instructions - */ - u8 image[]; -}; - -static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, - u8 **image_ptr) -{ - unsigned int sz, hole; - struct bpf_binary_header *header; - - /* Most of BPF filters are really small, - * but if some of them fill a page, allow at least - * 128 extra bytes to insert a random section of int3 - */ - sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE); - header = module_alloc(sz); - if (!header) - return NULL; - - memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ - - header->pages = sz / PAGE_SIZE; - hole = min(sz - (proglen + sizeof(*header)), PAGE_SIZE - sizeof(*header)); - - /* insert a random number of int3 instructions before BPF code */ - *image_ptr = &header->image[prandom_u32() % hole]; - return header; -} - /* pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_REG + 1) @@ -206,6 +171,12 @@ static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } +static void jit_fill_hole(void *area, unsigned int size) +{ + /* fill whole space with int3 instructions */ + memset(area, 0xcc, size); +} + struct jit_context { unsigned int cleanup_addr; /* epilogue code offset */ bool seen_ld_abs; @@ -959,7 +930,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) if (proglen <= 0) { image = NULL; if (header) - module_free(NULL, header); + bpf_jit_binary_free(header); goto out; } if (image) { @@ -969,7 +940,8 @@ void bpf_int_jit_compile(struct bpf_prog *prog) break; } if (proglen == oldproglen) { - header = bpf_alloc_binary(proglen, &image); + header = bpf_jit_binary_alloc(proglen, &image, + 1, jit_fill_hole); if (!header) goto out; } @@ -998,7 +970,7 @@ void bpf_jit_free(struct bpf_prog *fp) goto free_filter; set_memory_rw(addr, header->pages); - module_free(NULL, header); + bpf_jit_binary_free(header); free_filter: bpf_prog_unlock_free(fp); diff --git a/include/linux/filter.h b/include/linux/filter.h index 8f82ef3f1cdd..868764fcffb8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -289,6 +289,11 @@ struct sock_fprog_kern { struct sock_filter *filter; }; +struct bpf_binary_header { + unsigned int pages; + u8 image[]; +}; + struct bpf_work_struct { struct bpf_prog *prog; struct work_struct work; @@ -358,6 +363,14 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); +typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); + +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_jit_binary_free(struct bpf_binary_header *hdr); + static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { bpf_prog_unlock_ro(fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2c2bfaacce66..8ee520f0ec70 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -20,9 +20,12 @@ * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ + #include #include #include +#include +#include #include /* Registers */ @@ -125,6 +128,42 @@ void __bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(__bpf_prog_free); +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *hdr; + unsigned int size, hole, start; + + /* Most of BPF filters are really small, but if some of them + * fill a page, allow at least 128 extra bytes to insert a + * random section of illegal instructions. + */ + size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); + hdr = module_alloc(size); + if (hdr == NULL) + return NULL; + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(hdr, size); + + hdr->pages = size / PAGE_SIZE; + hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), + PAGE_SIZE - sizeof(*hdr)); + start = (prandom_u32() % hole) & ~(alignment - 1); + + /* Leave a random number of instructions before BPF code. */ + *image_ptr = &hdr->image[start]; + + return hdr; +} + +void bpf_jit_binary_free(struct bpf_binary_header *hdr) +{ + module_free(NULL, hdr); +} + /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. -- cgit v1.2.3 From 286aad3c4014ca825c447e07e24f8929e6d266d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 Sep 2014 08:04:49 +0200 Subject: net: bpf: be friendly to kmemcheck Reported by Mikulas Patocka, kmemcheck currently barks out a false positive since we don't have special kmemcheck annotation for bitfields used in bpf_prog structure. We currently have jited:1, len:31 and thus when accessing len while CONFIG_KMEMCHECK enabled, kmemcheck throws a warning that we're reading uninitialized memory. As we don't need the whole bit universe for pages member, we can just split it to u16 and use a bool flag for jited instead of a bitfield. Signed-off-by: Mikulas Patocka Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/arm/net/bpf_jit_32.c | 2 +- arch/mips/net/bpf_jit.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- include/linux/filter.h | 6 +++--- net/core/filter.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 2d1a5b93d91c..6b45f649eff0 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -933,7 +933,7 @@ void bpf_jit_compile(struct bpf_prog *fp) set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *)ctx.target; - fp->jited = 1; + fp->jited = true; out: kfree(ctx.offsets); return; diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index cfa83cf2447d..0e97ccd29fe3 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1417,7 +1417,7 @@ void bpf_jit_compile(struct bpf_prog *fp) bpf_jit_dump(fp->len, alloc_size, 2, ctx.target); fp->bpf_func = (void *)ctx.target; - fp->jited = 1; + fp->jited = true; out: kfree(ctx.offsets); diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 40c53ff59124..cbae2dfd053c 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -686,7 +686,7 @@ void bpf_jit_compile(struct bpf_prog *fp) ((u64 *)image)[0] = (u64)code_base; ((u64 *)image)[1] = local_paca->kernel_toc; fp->bpf_func = (void *)image; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index b734f975c22e..555f5c7e83ab 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -842,7 +842,7 @@ void bpf_jit_compile(struct bpf_prog *fp) if (jit.start) { set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *) jit.start; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index f7a736b645e8..b2ad9dc5425e 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -801,7 +801,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; if (image) { bpf_flush_icache(image, image + proglen); fp->bpf_func = (void *)image; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9de0b5476b0c..d56cd1f515bd 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -955,7 +955,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) bpf_flush_icache(header, image + proglen); set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)image; - prog->jited = 1; + prog->jited = true; } out: kfree(addrs); diff --git a/include/linux/filter.h b/include/linux/filter.h index 868764fcffb8..4b59edead908 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -300,9 +300,9 @@ struct bpf_work_struct { }; struct bpf_prog { - u32 pages; /* Number of allocated pages */ - u32 jited:1, /* Is our filter JIT'ed? */ - len:31; /* Number of filter blocks */ + u16 pages; /* Number of allocated pages */ + bool jited; /* Is our filter JIT'ed? */ + u32 len; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ struct bpf_work_struct *work; /* Deferred free work struct */ unsigned int (*bpf_func)(const struct sk_buff *skb, diff --git a/net/core/filter.c b/net/core/filter.c index fa5b7d0f77ac..dfc716ffa44b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -972,7 +972,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) int err; fp->bpf_func = NULL; - fp->jited = 0; + fp->jited = false; err = bpf_check_classic(fp->insns, fp->len); if (err) { -- cgit v1.2.3 From 67cc0d4077951295f42bed63805e91b46c24477b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 8 Sep 2014 19:58:58 -0400 Subject: net-timestamp: optimize sock_tx_timestamp default path Few packets have timestamping enabled. Exit sock_tx_timestamp quickly in this common case. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 10 +++++++++- net/socket.c | 7 ++----- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 049ab1b732a6..515a4d01e932 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2199,6 +2199,8 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, sk->sk_stamp = skb->tstamp; } +void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags); + /** * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet @@ -2206,7 +2208,13 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, * * Note : callers should take care of initial *tx_flags value (usually 0) */ -void sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags); +static inline void sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) +{ + if (unlikely(sk->sk_tsflags)) + __sock_tx_timestamp(sk, tx_flags); + if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) + *tx_flags |= SKBTX_WIFI_STATUS; +} /** * sk_eat_skb - Release a skb if it is no longer needed diff --git a/net/socket.c b/net/socket.c index 2e2586e2dee1..d40f522541aa 100644 --- a/net/socket.c +++ b/net/socket.c @@ -610,7 +610,7 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -void sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) +void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) { u8 flags = *tx_flags; @@ -626,12 +626,9 @@ void sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK) flags |= SKBTX_ACK_TSTAMP; - if (sock_flag(sk, SOCK_WIFI_STATUS)) - flags |= SKBTX_WIFI_STATUS; - *tx_flags = flags; } -EXPORT_SYMBOL(sock_tx_timestamp); +EXPORT_SYMBOL(__sock_tx_timestamp); static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) -- cgit v1.2.3 From b954d83421d51d822c42e5ab7b65069b25ad3005 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Sep 2014 15:01:02 +0200 Subject: net: bpf: only build bpf_jit_binary_{alloc, free}() when jit selected Since BPF JIT depends on the availability of module_alloc() and module_free() helpers (HAVE_BPF_JIT and MODULES), we better build that code only in case we have BPF_JIT in our config enabled, just like with other JIT code. Fixes builds for arm/marzen_defconfig and sh/rsk7269_defconfig. ==================== kernel/built-in.o: In function `bpf_jit_binary_alloc': /home/cwang/linux/kernel/bpf/core.c:144: undefined reference to `module_alloc' kernel/built-in.o: In function `bpf_jit_binary_free': /home/cwang/linux/kernel/bpf/core.c:164: undefined reference to `module_free' make: *** [vmlinux] Error 1 ==================== Reported-by: Fengguang Wu Fixes: 738cbe72adc5 ("net: bpf: consolidate JIT binary allocator") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 78 +++++++++++++++++++++++++------------------------- kernel/bpf/core.c | 2 ++ 2 files changed, 41 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4b59edead908..1a0bc6d134d7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -4,12 +4,18 @@ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ +#include + #include #include #include +#include +#include #include -#include + #include + +#include #include struct sk_buff; @@ -363,14 +369,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); -typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); - -struct bpf_binary_header * -bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, - unsigned int alignment, - bpf_jit_fill_hole_t bpf_fill_ill_insns); -void bpf_jit_binary_free(struct bpf_binary_header *hdr); - static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { bpf_prog_unlock_ro(fp); @@ -393,6 +391,38 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); void bpf_int_jit_compile(struct bpf_prog *fp); +#ifdef CONFIG_BPF_JIT +typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); + +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_jit_binary_free(struct bpf_binary_header *hdr); + +void bpf_jit_compile(struct bpf_prog *fp); +void bpf_jit_free(struct bpf_prog *fp); + +static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, + u32 pass, void *image) +{ + pr_err("flen=%u proglen=%u pass=%u image=%pK\n", + flen, proglen, pass, image); + if (image) + print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, + 16, 1, image, proglen, false); +} +#else +static inline void bpf_jit_compile(struct bpf_prog *fp) +{ +} + +static inline void bpf_jit_free(struct bpf_prog *fp) +{ + bpf_prog_unlock_free(fp); +} +#endif /* CONFIG_BPF_JIT */ + #define BPF_ANC BIT(15) static inline u16 bpf_anc_helper(const struct sock_filter *ftest) @@ -440,36 +470,6 @@ static inline void *bpf_load_pointer(const struct sk_buff *skb, int k, return bpf_internal_load_pointer_neg_helper(skb, k, size); } -#ifdef CONFIG_BPF_JIT -#include -#include -#include - -void bpf_jit_compile(struct bpf_prog *fp); -void bpf_jit_free(struct bpf_prog *fp); - -static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, - u32 pass, void *image) -{ - pr_err("flen=%u proglen=%u pass=%u image=%pK\n", - flen, proglen, pass, image); - if (image) - print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, - 16, 1, image, proglen, false); -} -#else -#include - -static inline void bpf_jit_compile(struct bpf_prog *fp) -{ -} - -static inline void bpf_jit_free(struct bpf_prog *fp) -{ - bpf_prog_unlock_free(fp); -} -#endif /* CONFIG_BPF_JIT */ - static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8ee520f0ec70..8b7002488251 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -128,6 +128,7 @@ void __bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(__bpf_prog_free); +#ifdef CONFIG_BPF_JIT struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -163,6 +164,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) { module_free(NULL, hdr); } +#endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs -- cgit v1.2.3 From 78f686cae0c67a2edd167cbbe2f36017f0fa4b30 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Sep 2014 22:28:06 +0300 Subject: cfg80211: don't put kek/kck/replay counter on the stack There's no need to put the values on the stack, just pass a pointer to the data in the nl80211 message. This reduces stack usage and avoids potential issues with putting sensitive data on the stack. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 ++++------ net/wireless/nl80211.c | 10 +++------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0d17ec9df692..c2c710c14a50 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1980,14 +1980,12 @@ struct cfg80211_wowlan_wakeup { /** * struct cfg80211_gtk_rekey_data - rekey data - * @kek: key encryption key - * @kck: key confirmation key - * @replay_ctr: replay counter + * @kek: key encryption key (NL80211_KEK_LEN bytes) + * @kck: key confirmation key (NL80211_KCK_LEN bytes) + * @replay_ctr: replay counter (NL80211_REPLAY_CTR_LEN bytes) */ struct cfg80211_gtk_rekey_data { - u8 kek[NL80211_KEK_LEN]; - u8 kck[NL80211_KCK_LEN]; - u8 replay_ctr[NL80211_REPLAY_CTR_LEN]; + const u8 *kek, *kck, *replay_ctr; }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e388a9f28895..bebdf3d0ae75 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8959,13 +8959,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN) return -ERANGE; - memcpy(rekey_data.kek, nla_data(tb[NL80211_REKEY_DATA_KEK]), - NL80211_KEK_LEN); - memcpy(rekey_data.kck, nla_data(tb[NL80211_REKEY_DATA_KCK]), - NL80211_KCK_LEN); - memcpy(rekey_data.replay_ctr, - nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]), - NL80211_REPLAY_CTR_LEN); + rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]); + rekey_data.kck = nla_data(tb[NL80211_REKEY_DATA_KCK]); + rekey_data.replay_ctr = nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]); wdev_lock(wdev); if (!wdev->current_bss) { -- cgit v1.2.3 From 960d01acf62747d6518694f92be5b06f67473833 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 9 Sep 2014 22:55:35 +0300 Subject: cfg80211: add WMM traffic stream API Add nl80211 and driver API to validate, add and delete traffic streams with appropriate settings. The API calls for userspace doing the action frame handshake with the peer, and then allows only to set up the parameters in the driver. To avoid setting up a session only to tear it down again, the validate API is provided, but the real usage later can still fail so userspace must be prepared for that. Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++ include/net/cfg80211.h | 23 ++++++++- include/uapi/linux/nl80211.h | 28 +++++++++++ net/wireless/nl80211.c | 109 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 31 ++++++++++++ net/wireless/trace.h | 45 ++++++++++++++++++ 6 files changed, 239 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 61a09f0644aa..b1be39c76931 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -166,8 +166,12 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_MESH_ID_LEN 32 +#define IEEE80211_FIRST_TSPEC_TSID 8 #define IEEE80211_NUM_TIDS 16 +/* number of user priorities 802.11 uses */ +#define IEEE80211_NUM_UPS 8 + #define IEEE80211_QOS_CTL_LEN 2 /* 1d tag mask */ #define IEEE80211_QOS_CTL_TAG1D_MASK 0x0007 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c2c710c14a50..a13beab123dc 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2318,6 +2318,17 @@ struct cfg80211_qos_map { * @set_ap_chanwidth: Set the AP (including P2P GO) mode channel width for the * given interface This is used e.g. for dynamic HT 20/40 MHz channel width * changes during the lifetime of the BSS. + * + * @add_tx_ts: validate (if admitted_time is 0) or add a TX TS to the device + * with the given parameters; action frame exchange has been handled by + * userspace so this just has to modify the TX path to take the TS into + * account. + * If the admitted time is 0 just validate the parameters to make sure + * the session can be created at all; it is valid to just always return + * success for that but that may result in inefficient behaviour (handshake + * with the peer followed by immediate teardown when the addition is later + * rejected) + * @del_tx_ts: remove an existing TX TS */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2558,6 +2569,12 @@ struct cfg80211_ops { int (*set_ap_chanwidth)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef); + + int (*add_tx_ts)(struct wiphy *wiphy, struct net_device *dev, + u8 tsid, const u8 *peer, u8 user_prio, + u16 admitted_time); + int (*del_tx_ts)(struct wiphy *wiphy, struct net_device *dev, + u8 tsid, const u8 *peer); }; /* @@ -2604,9 +2621,13 @@ struct cfg80211_ops { * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels. * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in * beaconing mode (AP, IBSS, Mesh, ...). + * @WIPHY_FLAG_SUPPORTS_WMM_ADMISSION: the device supports setting up WMM + * TSPEC sessions (TID aka TSID 0-7) with the NL80211_CMD_ADD_TX_TS + * command. Standard IEEE 802.11 TSPEC setup is not yet supported, it + * needs to be able to handle Block-Ack agreements and other things. */ enum wiphy_flags { - /* use hole at 0 */ + WIPHY_FLAG_SUPPORTS_WMM_ADMISSION = BIT(0), /* use hole at 1 */ /* use hole at 2 */ WIPHY_FLAG_NETNS_OK = BIT(3), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 29c4399e08b9..e5b8caf5e466 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -722,6 +722,22 @@ * QoS mapping is relevant for IP packets, it is only valid during an * association. This is cleared on disassociation and AP restart. * + * @NL80211_CMD_ADD_TX_TS: Ask the kernel to add a traffic stream for the given + * %NL80211_ATTR_TSID and %NL80211_ATTR_MAC with %NL80211_ATTR_USER_PRIO + * and %NL80211_ATTR_ADMITTED_TIME parameters. + * Note that the action frame handshake with the AP shall be handled by + * userspace via the normal management RX/TX framework, this only sets + * up the TX TS in the driver/device. + * If the admitted time attribute is not added then the request just checks + * if a subsequent setup could be successful, the intent is to use this to + * avoid setting up a session with the AP when local restrictions would + * make that impossible. However, the subsequent "real" setup may still + * fail even if the check was successful. + * @NL80211_CMD_DEL_TX_TS: Remove an existing TS with the %NL80211_ATTR_TSID + * and %NL80211_ATTR_MAC parameters. It isn't necessary to call this + * before removing a station entry entirely, or before disassociating + * or similar, cleanup will happen in the driver/device in this case. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -893,6 +909,9 @@ enum nl80211_commands { NL80211_CMD_SET_QOS_MAP, + NL80211_CMD_ADD_TX_TS, + NL80211_CMD_DEL_TX_TS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -1611,6 +1630,11 @@ enum nl80211_commands { * drivers to indicate dynack capability. Dynack is automatically disabled * setting valid value for coverage class. * + * @NL80211_ATTR_TSID: a TSID value (u8 attribute) + * @NL80211_ATTR_USER_PRIO: user priority value (u8 attribute) + * @NL80211_ATTR_ADMITTED_TIME: admitted time in units of 32 microseconds + * (per second) (u16 attribute) + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1957,6 +1981,10 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_DYN_ACK, + NL80211_ATTR_TSID, + NL80211_ATTR_USER_PRIO, + NL80211_ATTR_ADMITTED_TIME, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e9fbd4f4ddb0..ab7ee4893e40 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -391,6 +391,9 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_IFACE_SOCKET_OWNER] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG }, + [NL80211_ATTR_TSID] = { .type = NLA_U8 }, + [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, + [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, }; /* policy for the key attributes */ @@ -1510,6 +1513,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH) CMD(channel_switch, CHANNEL_SWITCH); CMD(set_qos_map, SET_QOS_MAP); + if (rdev->wiphy.flags & + WIPHY_FLAG_SUPPORTS_WMM_ADMISSION) + CMD(add_tx_ts, ADD_TX_TS); } /* add into the if now */ #undef CMD @@ -9390,6 +9396,93 @@ static int nl80211_set_qos_map(struct sk_buff *skb, return ret; } +static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *peer; + u8 tsid, up; + u16 admitted_time = 0; + int err; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_USER_PRIO]) + return -EINVAL; + + tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); + if (tsid >= IEEE80211_NUM_TIDS) + return -EINVAL; + + up = nla_get_u8(info->attrs[NL80211_ATTR_USER_PRIO]); + if (up >= IEEE80211_NUM_UPS) + return -EINVAL; + + /* WMM uses TIDs 0-7 even for TSPEC */ + if (tsid < IEEE80211_FIRST_TSPEC_TSID) { + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) + return -EINVAL; + } else { + /* TODO: handle 802.11 TSPEC/admission control + * need more attributes for that (e.g. BA session requirement) + */ + return -EINVAL; + } + + peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_ADMITTED_TIME]) { + admitted_time = + nla_get_u16(info->attrs[NL80211_ATTR_ADMITTED_TIME]); + if (!admitted_time) + return -EINVAL; + } + + wdev_lock(wdev); + switch (wdev->iftype) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (wdev->current_bss) + break; + err = -ENOTCONN; + goto out; + default: + err = -EOPNOTSUPP; + goto out; + } + + err = rdev_add_tx_ts(rdev, dev, tsid, peer, up, admitted_time); + + out: + wdev_unlock(wdev); + return err; +} + +static int nl80211_del_tx_ts(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *peer; + u8 tsid; + int err; + + if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); + peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + + wdev_lock(wdev); + err = rdev_del_tx_ts(rdev, dev, tsid, peer); + wdev_unlock(wdev); + + return err; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -10147,6 +10240,22 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_ADD_TX_TS, + .doit = nl80211_add_tx_ts, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_TX_TS, + .doit = nl80211_del_tx_ts, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; /* notification functions */ diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 56c2240c30ce..f6d457d6a558 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -915,4 +915,35 @@ rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_add_tx_ts(struct cfg80211_registered_device *rdev, + struct net_device *dev, u8 tsid, const u8 *peer, + u8 user_prio, u16 admitted_time) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_add_tx_ts(&rdev->wiphy, dev, tsid, peer, + user_prio, admitted_time); + if (rdev->ops->add_tx_ts) + ret = rdev->ops->add_tx_ts(&rdev->wiphy, dev, tsid, peer, + user_prio, admitted_time); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + +static inline int +rdev_del_tx_ts(struct cfg80211_registered_device *rdev, + struct net_device *dev, u8 tsid, const u8 *peer) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_del_tx_ts(&rdev->wiphy, dev, tsid, peer); + if (rdev->ops->del_tx_ts) + ret = rdev->ops->del_tx_ts(&rdev->wiphy, dev, tsid, peer); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 0c524cd76c83..625a6e6d1168 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1896,6 +1896,51 @@ TRACE_EVENT(rdev_set_ap_chanwidth, WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) ); +TRACE_EVENT(rdev_add_tx_ts, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time), + TP_ARGS(wiphy, netdev, tsid, peer, user_prio, admitted_time), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tsid) + __field(u8, user_prio) + __field(u16, admitted_time) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tsid = tsid; + __entry->user_prio = user_prio; + __entry->admitted_time = admitted_time; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d, UP %d, time %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), + __entry->tsid, __entry->user_prio, __entry->admitted_time) +); + +TRACE_EVENT(rdev_del_tx_ts, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + u8 tsid, const u8 *peer), + TP_ARGS(wiphy, netdev, tsid, peer), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tsid) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tsid = tsid; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tsid) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ -- cgit v1.2.3 From b0b6aa2c8e0d0e34f7658d5cc1e4fbb59f701c42 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Tue, 9 Sep 2014 17:09:45 +0300 Subject: cfg80211/mac80211: add wmm info to assoc event Userspace might need to know what queues are configured for uapsd (e.g. for setting proper default values in tspecs). Add this bitmap to the association event (inside wmm nested attribute) Add additional parameter to cfg80211_rx_assoc_resp, and update its callers. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- net/mac80211/mlme.c | 9 ++++++++- net/wireless/mlme.c | 4 ++-- net/wireless/nl80211.c | 28 +++++++++++++++++++++------- net/wireless/nl80211.h | 3 ++- 5 files changed, 36 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a13beab123dc..2ed1f80f7eb6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3944,6 +3944,7 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); * moves to cfg80211 in this call * @buf: authentication frame (header + body) * @len: length of the frame data + * @uapsd_queues: bitmap of ACs configured to uapsd. -1 if n/a. * * After being asked to associate via cfg80211_ops::assoc() the driver must * call either this function or cfg80211_auth_timeout(). @@ -3952,7 +3953,8 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); */ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, - const u8 *buf, size_t len); + const u8 *buf, size_t len, + int uapsd_queues); /** * cfg80211_assoc_timeout - notification of timed out association diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a7b92f5f7161..efa41fc66c1f 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2817,6 +2817,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; u16 capab_info, status_code, aid; struct ieee802_11_elems elems; + int ac, uapsd_queues = -1; u8 *pos; bool reassoc; struct cfg80211_bss *bss; @@ -2886,9 +2887,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, * is set can cause the interface to go idle */ ieee80211_destroy_assoc_data(sdata, true); + + /* get uapsd queues configuration */ + uapsd_queues = 0; + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + if (sdata->tx_conf[ac].uapsd) + uapsd_queues |= BIT(ac); } - cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len); + cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len, uapsd_queues); } static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 369fc334fdad..2c52b59e43f3 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -19,7 +19,7 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, - const u8 *buf, size_t len) + const u8 *buf, size_t len, int uapsd_queues) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; @@ -43,7 +43,7 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, return; } - nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL); + nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL, uapsd_queues); /* update current_bss etc., consumes the bss reference */ __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs, status_code, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ab7ee4893e40..e8114c7a37e3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10524,7 +10524,8 @@ nla_put_failure: static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, - enum nl80211_commands cmd, gfp_t gfp) + enum nl80211_commands cmd, gfp_t gfp, + int uapsd_queues) { struct sk_buff *msg; void *hdr; @@ -10544,6 +10545,19 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_FRAME, len, buf)) goto nla_put_failure; + if (uapsd_queues >= 0) { + struct nlattr *nla_wmm = + nla_nest_start(msg, NL80211_ATTR_STA_WME); + if (!nla_wmm) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_STA_WME_UAPSD_QUEUES, + uapsd_queues)) + goto nla_put_failure; + + nla_nest_end(msg, nla_wmm); + } + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, @@ -10560,15 +10574,15 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, size_t len, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_AUTHENTICATE, gfp); + NL80211_CMD_AUTHENTICATE, gfp, -1); } void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, - size_t len, gfp_t gfp) + size_t len, gfp_t gfp, int uapsd_queues) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_ASSOCIATE, gfp); + NL80211_CMD_ASSOCIATE, gfp, uapsd_queues); } void nl80211_send_deauth(struct cfg80211_registered_device *rdev, @@ -10576,7 +10590,7 @@ void nl80211_send_deauth(struct cfg80211_registered_device *rdev, size_t len, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_DEAUTHENTICATE, gfp); + NL80211_CMD_DEAUTHENTICATE, gfp, -1); } void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, @@ -10584,7 +10598,7 @@ void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, size_t len, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_DISASSOCIATE, gfp); + NL80211_CMD_DISASSOCIATE, gfp, -1); } void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, @@ -10605,7 +10619,7 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, cmd = NL80211_CMD_UNPROT_DISASSOCIATE; trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len); - nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC); + nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC, -1); } EXPORT_SYMBOL(cfg80211_rx_unprot_mlme_mgmt); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 49c9a482dd12..7ad70d6f0cc6 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -23,7 +23,8 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, const u8 *buf, size_t len, gfp_t gfp); void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, - const u8 *buf, size_t len, gfp_t gfp); + const u8 *buf, size_t len, gfp_t gfp, + int uapsd_queues); void nl80211_send_deauth(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, gfp_t gfp); -- cgit v1.2.3 From 18998c381b19bfc3c285361ff6200ded7444aa2c Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 10 Sep 2014 14:07:34 +0300 Subject: cfg80211: allow requesting SMPS mode on ap start Add feature bits to indicate device support for static-smps and dynamic-smps modes. Add a new NL80211_ATTR_SMPS_MODE attribue to allow configuring the smps mode to be used by the ap (e.g. configuring to ap to dynamic smps mode will reduce power consumption while having minor effect on throughput) Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 33 +++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 24 ++++++++++++++++++++++++ 3 files changed, 59 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2ed1f80f7eb6..a2ddcf2398fd 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -664,6 +664,7 @@ struct cfg80211_acl_data { * @crypto: crypto settings * @privacy: the BSS uses privacy * @auth_type: Authentication type (algorithm) + * @smps_mode: SMPS mode * @inactivity_timeout: time in seconds to determine station's inactivity. * @p2p_ctwindow: P2P CT Window * @p2p_opp_ps: P2P opportunistic PS @@ -682,6 +683,7 @@ struct cfg80211_ap_settings { struct cfg80211_crypto_settings crypto; bool privacy; enum nl80211_auth_type auth_type; + enum nl80211_smps_mode smps_mode; int inactivity_timeout; u8 p2p_ctwindow; bool p2p_opp_ps; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e5b8caf5e466..4b28dc07bcb1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1635,6 +1635,9 @@ enum nl80211_commands { * @NL80211_ATTR_ADMITTED_TIME: admitted time in units of 32 microseconds * (per second) (u16 attribute) * + * @NL80211_ATTR_SMPS_MODE: SMPS mode to use (ap mode). see + * &enum nl80211_smps_mode. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1985,6 +1988,8 @@ enum nl80211_attrs { NL80211_ATTR_USER_PRIO, NL80211_ATTR_ADMITTED_TIME, + NL80211_ATTR_SMPS_MODE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4030,6 +4035,13 @@ enum nl80211_ap_sme_features { * @NL80211_FEATURE_ACKTO_ESTIMATION: This driver supports dynamic ACK timeout * estimation (dynack). %NL80211_ATTR_WIPHY_DYN_ACK flag attribute is used * to enable dynack. + * @NL80211_FEATURE_STATIC_SMPS: Device supports static spatial + * multiplexing powersave, ie. can turn off all but one chain + * even on HT connections that should be using more chains. + * @NL80211_FEATURE_DYNAMIC_SMPS: Device supports dynamic spatial + * multiplexing powersave, ie. can turn off all but one chain + * and then wake the rest up as required after, for example, + * rts/cts handshake. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -4056,6 +4068,8 @@ enum nl80211_feature_flags { NL80211_FEATURE_QUIET = 1 << 21, NL80211_FEATURE_TX_POWER_INSERTION = 1 << 22, NL80211_FEATURE_ACKTO_ESTIMATION = 1 << 23, + NL80211_FEATURE_STATIC_SMPS = 1 << 24, + NL80211_FEATURE_DYNAMIC_SMPS = 1 << 25, }; /** @@ -4129,6 +4143,25 @@ enum nl80211_acl_policy { NL80211_ACL_POLICY_DENY_UNLESS_LISTED, }; +/** + * enum nl80211_smps_mode - SMPS mode + * + * Requested SMPS mode (for AP mode) + * + * @NL80211_SMPS_OFF: SMPS off (use all antennas). + * @NL80211_SMPS_STATIC: static SMPS (use a single antenna) + * @NL80211_SMPS_DYNAMIC: dynamic smps (start with a single antenna and + * turn on other antennas after CTS/RTS). + */ +enum nl80211_smps_mode { + NL80211_SMPS_OFF, + NL80211_SMPS_STATIC, + NL80211_SMPS_DYNAMIC, + + __NL80211_SMPS_AFTER_LAST, + NL80211_SMPS_MAX = __NL80211_SMPS_AFTER_LAST - 1 +}; + /** * enum nl80211_radar_event - type of radar event for DFS operation * diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e8114c7a37e3..4cce3e17964d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -394,6 +394,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_TSID] = { .type = NLA_U8 }, [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, + [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, }; /* policy for the key attributes */ @@ -3345,6 +3346,29 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(params.acl); } + if (info->attrs[NL80211_ATTR_SMPS_MODE]) { + params.smps_mode = + nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); + switch (params.smps_mode) { + case NL80211_SMPS_OFF: + break; + case NL80211_SMPS_STATIC: + if (!(rdev->wiphy.features & + NL80211_FEATURE_STATIC_SMPS)) + return -EINVAL; + break; + case NL80211_SMPS_DYNAMIC: + if (!(rdev->wiphy.features & + NL80211_FEATURE_DYNAMIC_SMPS)) + return -EINVAL; + break; + default: + return -EINVAL; + } + } else { + params.smps_mode = NL80211_SMPS_OFF; + } + wdev_lock(wdev); err = rdev_start_ap(rdev, dev, ¶ms); if (!err) { -- cgit v1.2.3 From 0d8614b4b926d0f657d15d7eb5125bcb24b9fd41 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 10 Sep 2014 14:07:36 +0300 Subject: mac80211: replace SMPS hw flags with wiphy feature bits Use the new static_smps / dynamic_smps feature bits instead of mac80211-internal hw flags. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath10k/mac.c | 5 +++-- drivers/net/wireless/iwlegacy/4965-mac.c | 5 ++--- drivers/net/wireless/iwlwifi/dvm/mac80211.c | 4 ++-- drivers/net/wireless/iwlwifi/mvm/mac80211.c | 8 ++++---- drivers/net/wireless/mac80211_hwsim.c | 8 ++++---- include/net/mac80211.h | 13 +------------ net/mac80211/debugfs.c | 5 ----- net/mac80211/debugfs_netdev.c | 4 ++-- net/mac80211/mlme.c | 2 +- 9 files changed, 19 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 9d61bb157189..4b5f0011ab5d 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -4759,7 +4759,6 @@ int ath10k_mac_register(struct ath10k *ar) IEEE80211_HW_MFP_CAPABLE | IEEE80211_HW_REPORTS_TX_ACK_STATUS | IEEE80211_HW_HAS_RATE_CONTROL | - IEEE80211_HW_SUPPORTS_STATIC_SMPS | IEEE80211_HW_AP_LINK_PS | IEEE80211_HW_SPECTRUM_MGMT; @@ -4767,8 +4766,10 @@ int ath10k_mac_register(struct ath10k *ar) * bytes is used for padding/alignment if necessary. */ ar->hw->extra_tx_headroom += sizeof(struct htt_data_tx_desc_frag)*2 + 4; + ar->hw->wiphy->features |= NL80211_FEATURE_STATIC_SMPS; + if (ar->ht_cap_info & WMI_HT_CAP_DYNAMIC_SMPS) - ar->hw->flags |= IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS; + ar->hw->wiphy->features |= NL80211_FEATURE_DYNAMIC_SMPS; if (ar->ht_cap_info & WMI_HT_CAP_ENABLED) { ar->hw->flags |= IEEE80211_HW_AMPDU_AGGREGATION; diff --git a/drivers/net/wireless/iwlegacy/4965-mac.c b/drivers/net/wireless/iwlegacy/4965-mac.c index 3dcbe2cd2b28..9f930a0cba09 100644 --- a/drivers/net/wireless/iwlegacy/4965-mac.c +++ b/drivers/net/wireless/iwlegacy/4965-mac.c @@ -5757,9 +5757,8 @@ il4965_mac_setup_register(struct il_priv *il, u32 max_probe_length) IEEE80211_HW_REPORTS_TX_ACK_STATUS | IEEE80211_HW_SUPPORTS_PS | IEEE80211_HW_SUPPORTS_DYNAMIC_PS; if (il->cfg->sku & IL_SKU_N) - hw->flags |= - IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS | - IEEE80211_HW_SUPPORTS_STATIC_SMPS; + hw->wiphy->features |= NL80211_FEATURE_DYNAMIC_SMPS | + NL80211_FEATURE_STATIC_SMPS; hw->sta_data_size = sizeof(struct il_station_priv); hw->vif_data_size = sizeof(struct il_vif_priv); diff --git a/drivers/net/wireless/iwlwifi/dvm/mac80211.c b/drivers/net/wireless/iwlwifi/dvm/mac80211.c index afb98f4fdaf3..2364a3c09b9e 100644 --- a/drivers/net/wireless/iwlwifi/dvm/mac80211.c +++ b/drivers/net/wireless/iwlwifi/dvm/mac80211.c @@ -125,8 +125,8 @@ int iwlagn_mac_setup_register(struct iwl_priv *priv, */ if (priv->nvm_data->sku_cap_11n_enable) - hw->flags |= IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS | - IEEE80211_HW_SUPPORTS_STATIC_SMPS; + hw->wiphy->features |= NL80211_FEATURE_DYNAMIC_SMPS | + NL80211_FEATURE_STATIC_SMPS; /* * Enable 11w if advertised by firmware and software crypto diff --git a/drivers/net/wireless/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/iwlwifi/mvm/mac80211.c index 7c8796584c25..395fc05638c2 100644 --- a/drivers/net/wireless/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/iwlwifi/mvm/mac80211.c @@ -301,9 +301,7 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm) IEEE80211_HW_AMPDU_AGGREGATION | IEEE80211_HW_TIMING_BEACON_ONLY | IEEE80211_HW_CONNECTION_MONITOR | - IEEE80211_HW_CHANCTX_STA_CSA | - IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS | - IEEE80211_HW_SUPPORTS_STATIC_SMPS; + IEEE80211_HW_CHANCTX_STA_CSA; hw->queues = mvm->first_agg_queue; hw->offchannel_tx_hw_queue = IWL_MVM_OFFCHANNEL_QUEUE; @@ -405,7 +403,9 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm) hw->wiphy->features |= NL80211_FEATURE_P2P_GO_CTWIN | NL80211_FEATURE_LOW_PRIORITY_SCAN | - NL80211_FEATURE_P2P_GO_OPPPS; + NL80211_FEATURE_P2P_GO_OPPPS | + NL80211_FEATURE_DYNAMIC_SMPS | + NL80211_FEATURE_STATIC_SMPS; mvm->rts_threshold = IEEE80211_MAX_RTS_THRESHOLD; diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 1326f6121835..babbdc1ce741 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2045,8 +2045,6 @@ static int mac80211_hwsim_create_radio(int channels, const char *reg_alpha2, hw->flags = IEEE80211_HW_MFP_CAPABLE | IEEE80211_HW_SIGNAL_DBM | - IEEE80211_HW_SUPPORTS_STATIC_SMPS | - IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS | IEEE80211_HW_AMPDU_AGGREGATION | IEEE80211_HW_WANT_MONITOR_VIF | IEEE80211_HW_QUEUE_CONTROL | @@ -2059,8 +2057,10 @@ static int mac80211_hwsim_create_radio(int channels, const char *reg_alpha2, WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL | WIPHY_FLAG_AP_UAPSD | WIPHY_FLAG_HAS_CHANNEL_SWITCH; - hw->wiphy->features |= NL80211_FEATURE_ACTIVE_MONITOR; - hw->wiphy->features |= NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE; + hw->wiphy->features |= NL80211_FEATURE_ACTIVE_MONITOR | + NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE | + NL80211_FEATURE_STATIC_SMPS | + NL80211_FEATURE_DYNAMIC_SMPS; /* ask mac80211 to reserve space for magic */ hw->vif_data_size = sizeof(struct hwsim_vif_priv); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c6e6a6804ee4..0ad1f47d2dc7 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1537,16 +1537,6 @@ struct ieee80211_tx_control { * @IEEE80211_HW_MFP_CAPABLE: * Hardware supports management frame protection (MFP, IEEE 802.11w). * - * @IEEE80211_HW_SUPPORTS_STATIC_SMPS: - * Hardware supports static spatial multiplexing powersave, - * ie. can turn off all but one chain even on HT connections - * that should be using more chains. - * - * @IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS: - * Hardware supports dynamic spatial multiplexing powersave, - * ie. can turn off all but one chain and then wake the rest - * up as required after, for example, rts/cts handshake. - * * @IEEE80211_HW_SUPPORTS_UAPSD: * Hardware supports Unscheduled Automatic Power Save Delivery * (U-APSD) in managed mode. The mode is configured with @@ -1632,8 +1622,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_DYNAMIC_PS = 1<<12, IEEE80211_HW_MFP_CAPABLE = 1<<13, IEEE80211_HW_WANT_MONITOR_VIF = 1<<14, - IEEE80211_HW_SUPPORTS_STATIC_SMPS = 1<<15, - IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS = 1<<16, + /* free slots */ IEEE80211_HW_SUPPORTS_UAPSD = 1<<17, IEEE80211_HW_REPORTS_TX_ACK_STATUS = 1<<18, IEEE80211_HW_CONNECTION_MONITOR = 1<<19, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 574656174f91..54a189f0393e 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -303,11 +303,6 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_DYNAMIC_PS\n"); if (local->hw.flags & IEEE80211_HW_MFP_CAPABLE) sf += scnprintf(buf + sf, mxln - sf, "MFP_CAPABLE\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_STATIC_SMPS) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_STATIC_SMPS\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) - sf += scnprintf(buf + sf, mxln - sf, - "SUPPORTS_DYNAMIC_SMPS\n"); if (local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_UAPSD\n"); if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index e205ebabfa50..c68896adfa96 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -226,12 +226,12 @@ static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; int err; - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_STATIC_SMPS) && + if (!(local->hw.wiphy->features & NL80211_FEATURE_STATIC_SMPS) && smps_mode == IEEE80211_SMPS_STATIC) return -EINVAL; /* auto should be dynamic if in PS mode */ - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) && + if (!(local->hw.wiphy->features & NL80211_FEATURE_DYNAMIC_SMPS) && (smps_mode == IEEE80211_SMPS_DYNAMIC || smps_mode == IEEE80211_SMPS_AUTOMATIC)) return -EINVAL; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index efa41fc66c1f..9d4ccb2cf3c9 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3805,7 +3805,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ifmgd->uapsd_max_sp_len = sdata->local->hw.uapsd_max_sp_len; ifmgd->p2p_noa_index = -1; - if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) + if (sdata->local->hw.wiphy->features & NL80211_FEATURE_DYNAMIC_SMPS) ifmgd->req_smps = IEEE80211_SMPS_AUTOMATIC; else ifmgd->req_smps = IEEE80211_SMPS_OFF; -- cgit v1.2.3 From 67981fefb20e717cea55b42f9081a833fa46b3be Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Sep 2014 16:55:04 +0200 Subject: netfilter: fix compilation of masquerading without IP_NF_TARGET_MASQUERADE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CONFIG_NF_NAT_MASQUERADE_IPV6=m # CONFIG_IP6_NF_TARGET_MASQUERADE is not set results in: net/ipv6/netfilter/nf_nat_masquerade_ipv6.c: In function ‘nf_nat_masquerade_ipv6’: net/ipv6/netfilter/nf_nat_masquerade_ipv6.c:41:14: error: ‘struct nf_conn_nat’ has no member named ‘masq_index’ nfct_nat(ct)->masq_index = out->ifindex; ^ net/ipv6/netfilter/nf_nat_masquerade_ipv6.c: In function ‘device_cmp’: net/ipv6/netfilter/nf_nat_masquerade_ipv6.c:61:12: error: ‘const struct nf_conn_nat’ has no member named ‘masq_index’ return nat->masq_index == (int)(long)ifindex; ^ net/ipv6/netfilter/nf_nat_masquerade_ipv6.c:62:1: warning: control reaches end of non-void function [-Wreturn-type] } ^ make[3]: *** [net/ipv6/netfilter/nf_nat_masquerade_ipv6.o] Error 1 Fix this by using the new NF_NAT_MASQUERADE_IPV4 and _IPV6 symbols in include/net/netfilter/nf_nat.h. Reported-by: Jim Davis Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index a71dd333ac68..344b1ab19220 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -32,10 +32,8 @@ struct nf_conn_nat { struct hlist_node bysource; struct nf_conn *ct; union nf_conntrack_nat_help help; -#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ - defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) || \ - defined(CONFIG_IP6_NF_TARGET_MASQUERADE) || \ - defined(CONFIG_IP6_NF_TARGET_MASQUERADE_MODULE) +#if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV4) || \ + IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV6) int masq_index; #endif }; @@ -68,8 +66,8 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum, struct nf_conn_nat *nat, const struct net_device *out) { -#if IS_ENABLED(CONFIG_IP_NF_TARGET_MASQUERADE) || \ - IS_ENABLED(CONFIG_IP6_NF_TARGET_MASQUERADE) +#if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV4) || \ + IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV6) return nat->masq_index && hooknum == NF_INET_POST_ROUTING && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL && nat->masq_index != out->ifindex; -- cgit v1.2.3 From 39e393bb4f653d38aea40190e1aa9a49062eed4d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Sep 2014 11:02:39 +0200 Subject: netfilter: nf_tables: add NFTA_MASQ_UNSPEC to nft_masq_attributes To keep this consistent with other nft_*_attributes. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index eeec0ae845ef..66d66dd3ff79 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -806,6 +806,7 @@ enum nft_nat_attributes { * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) */ enum nft_masq_attributes { + NFTA_MASQ_UNSPEC, NFTA_MASQ_FLAGS, __NFTA_MASQ_MAX }; -- cgit v1.2.3 From 46e5da40aec256155cfedee96dd21a75da941f2c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:04:52 -0700 Subject: net: qdisc: use rcu prefix and silence sparse warnings Add __rcu notation to qdisc handling by doing this we can make smatch output more legible. And anyways some of the cases should be using rcu_dereference() see qdisc_all_tx_empty(), qdisc_tx_chainging(), and so on. Also *wake_queue() API is commonly called from driver timer routines without rcu lock or rtnl lock. So I added rcu_read_lock() blocks around netif_wake_subqueue and netif_tx_wake_queue. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 29 ++++----------------------- include/net/sch_generic.h | 21 +++++++++++++------ net/core/dev.c | 51 +++++++++++++++++++++++++++++++++++++++++++++-- net/sched/sch_generic.c | 4 ++-- net/sched/sch_mqprio.c | 6 ++++-- net/sched/sch_teql.c | 13 +++++++----- 6 files changed, 82 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ba72f6baae1a..ae721f53739e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -543,7 +543,7 @@ struct netdev_queue { * read mostly part */ struct net_device *dev; - struct Qdisc *qdisc; + struct Qdisc __rcu *qdisc; struct Qdisc *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; @@ -2356,12 +2356,7 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd, DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); void __netif_schedule(struct Qdisc *q); - -static inline void netif_schedule_queue(struct netdev_queue *txq) -{ - if (!(txq->state & QUEUE_STATE_ANY_XOFF)) - __netif_schedule(txq->qdisc); -} +void netif_schedule_queue(struct netdev_queue *txq); static inline void netif_tx_schedule_all(struct net_device *dev) { @@ -2397,11 +2392,7 @@ static inline void netif_tx_start_all_queues(struct net_device *dev) } } -static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue) -{ - if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) - __netif_schedule(dev_queue->qdisc); -} +void netif_tx_wake_queue(struct netdev_queue *dev_queue); /** * netif_wake_queue - restart transmit @@ -2673,19 +2664,7 @@ static inline bool netif_subqueue_stopped(const struct net_device *dev, return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb)); } -/** - * netif_wake_subqueue - allow sending packets on subqueue - * @dev: network device - * @queue_index: sub queue index - * - * Resume individual transmit queue of a device with multiple transmit queues. - */ -static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) -{ - struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); - if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) - __netif_schedule(txq->qdisc); -} +void netif_wake_subqueue(struct net_device *dev, u16 queue_index); #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a3cfb8ebeb53..56838ab29b42 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -259,7 +259,9 @@ static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc) static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc) { - return qdisc->dev_queue->qdisc; + struct Qdisc *q = rcu_dereference_rtnl(qdisc->dev_queue->qdisc); + + return q; } static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc) @@ -384,7 +386,7 @@ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) struct Qdisc *qdisc; for (; i < dev->num_tx_queues; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); @@ -402,13 +404,18 @@ static inline void qdisc_reset_all_tx(struct net_device *dev) static inline bool qdisc_all_tx_empty(const struct net_device *dev) { unsigned int i; + + rcu_read_lock(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - const struct Qdisc *q = txq->qdisc; + const struct Qdisc *q = rcu_dereference(txq->qdisc); - if (q->q.qlen) + if (q->q.qlen) { + rcu_read_unlock(); return false; + } } + rcu_read_unlock(); return true; } @@ -416,9 +423,10 @@ static inline bool qdisc_all_tx_empty(const struct net_device *dev) static inline bool qdisc_tx_changing(const struct net_device *dev) { unsigned int i; + for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (txq->qdisc != txq->qdisc_sleeping) + if (rcu_access_pointer(txq->qdisc) != txq->qdisc_sleeping) return true; } return false; @@ -428,9 +436,10 @@ static inline bool qdisc_tx_changing(const struct net_device *dev) static inline bool qdisc_tx_is_noop(const struct net_device *dev) { unsigned int i; + for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (txq->qdisc != &noop_qdisc) + if (rcu_access_pointer(txq->qdisc) != &noop_qdisc) return false; } return true; diff --git a/net/core/dev.c b/net/core/dev.c index 3c6a967e5830..b3d6dbc0c696 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2177,6 +2177,53 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) return (struct dev_kfree_skb_cb *)skb->cb; } +void netif_schedule_queue(struct netdev_queue *txq) +{ + rcu_read_lock(); + if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + struct Qdisc *q = rcu_dereference(txq->qdisc); + + __netif_schedule(q); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(netif_schedule_queue); + +/** + * netif_wake_subqueue - allow sending packets on subqueue + * @dev: network device + * @queue_index: sub queue index + * + * Resume individual transmit queue of a device with multiple transmit queues. + */ +void netif_wake_subqueue(struct net_device *dev, u16 queue_index) +{ + struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); + + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(txq->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_wake_subqueue); + +void netif_tx_wake_queue(struct netdev_queue *dev_queue) +{ + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(dev_queue->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_tx_wake_queue); + void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; @@ -3432,7 +3479,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - q = rxq->qdisc; + q = rcu_dereference(rxq->qdisc); if (q != &noop_qdisc) { spin_lock(qdisc_lock(q)); if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) @@ -3449,7 +3496,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, { struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - if (!rxq || rxq->qdisc == &noop_qdisc) + if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) goto out; if (*pt_prev) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 19696ebe9ebc..346ef85617d3 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -783,7 +783,7 @@ static void dev_deactivate_queue(struct net_device *dev, struct Qdisc *qdisc_default = _qdisc_default; struct Qdisc *qdisc; - qdisc = dev_queue->qdisc; + qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); @@ -876,7 +876,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, { struct Qdisc *qdisc = _qdisc; - dev_queue->qdisc = qdisc; + rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 6749e2f540d0..37e7d25d21f1 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -231,7 +231,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) memset(&sch->qstats, 0, sizeof(sch->qstats)); for (i = 0; i < dev->num_tx_queues; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); spin_lock_bh(qdisc_lock(qdisc)); sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; @@ -340,7 +340,9 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, spin_unlock_bh(d->lock); for (i = tc.offset; i < tc.offset + tc.count; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + struct netdev_queue *q = netdev_get_tx_queue(dev, i); + + qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); bstats.bytes += qdisc->bstats.bytes; bstats.packets += qdisc->bstats.packets; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index aaa8d03ed054..5cd291bd00e4 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -96,11 +96,14 @@ teql_dequeue(struct Qdisc *sch) struct teql_sched_data *dat = qdisc_priv(sch); struct netdev_queue *dat_queue; struct sk_buff *skb; + struct Qdisc *q; skb = __skb_dequeue(&dat->q); dat_queue = netdev_get_tx_queue(dat->m->dev, 0); + q = rcu_dereference_bh(dat_queue->qdisc); + if (skb == NULL) { - struct net_device *m = qdisc_dev(dat_queue->qdisc); + struct net_device *m = qdisc_dev(q); if (m) { dat->m->slaves = sch; netif_wake_queue(m); @@ -108,7 +111,7 @@ teql_dequeue(struct Qdisc *sch) } else { qdisc_bstats_update(sch, skb); } - sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen; + sch->q.qlen = dat->q.qlen + q->q.qlen; return skb; } @@ -157,9 +160,9 @@ teql_destroy(struct Qdisc *sch) txq = netdev_get_tx_queue(master->dev, 0); master->slaves = NULL; - root_lock = qdisc_root_sleeping_lock(txq->qdisc); + root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc)); spin_lock_bh(root_lock); - qdisc_reset(txq->qdisc); + qdisc_reset(rtnl_dereference(txq->qdisc)); spin_unlock_bh(root_lock); } } @@ -266,7 +269,7 @@ static inline int teql_resolve(struct sk_buff *skb, struct dst_entry *dst = skb_dst(skb); int res; - if (txq->qdisc == &noop_qdisc) + if (rcu_access_pointer(txq->qdisc) == &noop_qdisc) return -ENODEV; if (!dev->header_ops || !dst) -- cgit v1.2.3 From 25d8c0d55f241ce2d360df1bea48e23a55836ee6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:05:27 -0700 Subject: net: rcu-ify tcf_proto rcu'ify tcf_proto this allows calling tc_classify() without holding any locks. Updaters are protected by RTNL. This patch prepares the core net_sched infrastracture for running the classifier/action chains without holding the qdisc lock however it does nothing to ensure cls_xxx and act_xxx types also work without locking. Additional patches are required to address the fall out. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sch_generic.h | 9 +++++---- net/sched/cls_api.c | 30 +++++++++++++++--------------- net/sched/sch_api.c | 10 +++++----- net/sched/sch_atm.c | 20 +++++++++++--------- net/sched/sch_cbq.c | 11 +++++++---- net/sched/sch_choke.c | 15 +++++++++------ net/sched/sch_drr.c | 9 ++++++--- net/sched/sch_dsmark.c | 9 +++++---- net/sched/sch_fq_codel.c | 11 +++++++---- net/sched/sch_hfsc.c | 8 ++++---- net/sched/sch_htb.c | 15 ++++++++------- net/sched/sch_ingress.c | 8 +++++--- net/sched/sch_multiq.c | 8 +++++--- net/sched/sch_prio.c | 11 +++++++---- net/sched/sch_qfq.c | 9 ++++++--- net/sched/sch_sfb.c | 15 +++++++++------ net/sched/sch_sfq.c | 11 +++++++---- 17 files changed, 121 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 56838ab29b42..1e89b9ad3a4c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -143,7 +143,7 @@ struct Qdisc_class_ops { void (*walk)(struct Qdisc *, struct qdisc_walker * arg); /* Filter manipulation */ - struct tcf_proto ** (*tcf_chain)(struct Qdisc *, unsigned long); + struct tcf_proto __rcu ** (*tcf_chain)(struct Qdisc *, unsigned long); unsigned long (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid); void (*unbind_tcf)(struct Qdisc *, unsigned long); @@ -212,8 +212,8 @@ struct tcf_proto_ops { struct tcf_proto { /* Fast access part */ - struct tcf_proto *next; - void *root; + struct tcf_proto __rcu *next; + void __rcu *root; int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); @@ -225,6 +225,7 @@ struct tcf_proto { struct Qdisc *q; void *data; const struct tcf_proto_ops *ops; + struct rcu_head rcu; }; struct qdisc_skb_cb { @@ -378,7 +379,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab); void tcf_destroy(struct tcf_proto *tp); -void tcf_destroy_chain(struct tcf_proto **fl); +void tcf_destroy_chain(struct tcf_proto __rcu **fl); /* Reset all TX qdiscs greater then index of a device. */ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c28b0d327b12..e547efdaba9d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -117,7 +117,6 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; - spinlock_t *root_lock; struct tcmsg *t; u32 protocol; u32 prio; @@ -125,7 +124,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) u32 parent; struct net_device *dev; struct Qdisc *q; - struct tcf_proto **back, **chain; + struct tcf_proto __rcu **back; + struct tcf_proto __rcu **chain; struct tcf_proto *tp; const struct tcf_proto_ops *tp_ops; const struct Qdisc_class_ops *cops; @@ -197,7 +197,9 @@ replay: goto errout; /* Check the chain for existence of proto-tcf with this priority */ - for (back = chain; (tp = *back) != NULL; back = &tp->next) { + for (back = chain; + (tp = rtnl_dereference(*back)) != NULL; + back = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { if (!nprio || @@ -209,8 +211,6 @@ replay: } } - root_lock = qdisc_root_sleeping_lock(q); - if (tp == NULL) { /* Proto-tcf does not exist, create new one */ @@ -259,7 +259,8 @@ replay: } tp->ops = tp_ops; tp->protocol = protocol; - tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(*back)); + tp->prio = nprio ? : + TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back))); tp->q = q; tp->classify = tp_ops->classify; tp->classid = parent; @@ -280,9 +281,9 @@ replay: if (fh == 0) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { - spin_lock_bh(root_lock); - *back = tp->next; - spin_unlock_bh(root_lock); + struct tcf_proto *next = rtnl_dereference(tp->next); + + RCU_INIT_POINTER(*back, next); tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); tcf_destroy(tp); @@ -322,10 +323,8 @@ replay: n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE); if (err == 0) { if (tp_created) { - spin_lock_bh(root_lock); - tp->next = *back; - *back = tp; - spin_unlock_bh(root_lock); + RCU_INIT_POINTER(tp->next, rtnl_dereference(*back)); + rcu_assign_pointer(*back, tp); } tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); } else { @@ -420,7 +419,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) int s_t; struct net_device *dev; struct Qdisc *q; - struct tcf_proto *tp, **chain; + struct tcf_proto *tp, __rcu **chain; struct tcmsg *tcm = nlmsg_data(cb->nlh); unsigned long cl = 0; const struct Qdisc_class_ops *cops; @@ -454,7 +453,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; - for (tp = *chain, t = 0; tp; tp = tp->next, t++) { + for (tp = rtnl_dereference(*chain), t = 0; + tp; tp = rtnl_dereference(tp->next), t++) { if (t < s_t) continue; if (TC_H_MAJ(tcm->tcm_info) && diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 58bed7599db7..ca6248345937 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1781,7 +1781,7 @@ int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, __be16 protocol = skb->protocol; int err; - for (; tp; tp = tp->next) { + for (; tp; tp = rcu_dereference_bh(tp->next)) { if (tp->protocol != protocol && tp->protocol != htons(ETH_P_ALL)) continue; @@ -1833,15 +1833,15 @@ void tcf_destroy(struct tcf_proto *tp) { tp->ops->destroy(tp); module_put(tp->ops->owner); - kfree(tp); + kfree_rcu(tp, rcu); } -void tcf_destroy_chain(struct tcf_proto **fl) +void tcf_destroy_chain(struct tcf_proto __rcu **fl) { struct tcf_proto *tp; - while ((tp = *fl) != NULL) { - *fl = tp->next; + while ((tp = rtnl_dereference(*fl)) != NULL) { + RCU_INIT_POINTER(*fl, tp->next); tcf_destroy(tp); } } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 8449b337f9e3..c398f9c3dbdd 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -41,7 +41,7 @@ struct atm_flow_data { struct Qdisc *q; /* FIFO, TBF, etc. */ - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb); /* chaining */ @@ -273,7 +273,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, error = -ENOBUFS; goto err_out; } - flow->filter_list = NULL; + RCU_INIT_POINTER(flow->filter_list, NULL); flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); if (!flow->q) flow->q = &noop_qdisc; @@ -311,7 +311,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); if (list_empty(&flow->list)) return -EINVAL; - if (flow->filter_list || flow == &p->link) + if (rcu_access_pointer(flow->filter_list) || flow == &p->link) return -EBUSY; /* * Reference count must be 2: one for "keepalive" (set at class @@ -345,7 +345,8 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)cl; @@ -369,11 +370,12 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) flow = NULL; if (TC_H_MAJ(skb->priority) != sch->handle || !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) { + struct tcf_proto *fl; + list_for_each_entry(flow, &p->flows, list) { - if (flow->filter_list) { - result = tc_classify_compat(skb, - flow->filter_list, - &res); + fl = rcu_dereference_bh(flow->filter_list); + if (fl) { + result = tc_classify_compat(skb, fl, &res); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; @@ -544,7 +546,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) if (!p->link.q) p->link.q = &noop_qdisc; pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - p->link.filter_list = NULL; + RCU_INIT_POINTER(p->link.filter_list, NULL); p->link.vcc = NULL; p->link.sock = NULL; p->link.classid = sch->handle; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 762a04bb8f6d..a3244a800501 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -133,7 +133,7 @@ struct cbq_class { struct gnet_stats_rate_est64 rate_est; struct tc_cbq_xstats xstats; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; int refcnt; int filters; @@ -221,6 +221,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) struct cbq_class **defmap; struct cbq_class *cl = NULL; u32 prio = skb->priority; + struct tcf_proto *fl; struct tcf_result res; /* @@ -235,11 +236,12 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int result = 0; defmap = head->defaults; + fl = rcu_dereference_bh(head->filter_list); /* * Step 2+n. Apply classifier. */ - if (!head->filter_list || - (result = tc_classify_compat(skb, head->filter_list, &res)) < 0) + result = tc_classify_compat(skb, fl, &res); + if (!fl || result < 0) goto fallback; cl = (void *)res.class; @@ -1954,7 +1956,8 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) return 0; } -static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) +static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch, + unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ed30e436128b..74813e6b6ff6 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -57,7 +57,7 @@ struct choke_sched_data { /* Variables */ struct red_vars vars; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct { u32 prob_drop; /* Early probability drops */ u32 prob_mark; /* Early probability marks */ @@ -193,9 +193,11 @@ static bool choke_classify(struct sk_buff *skb, { struct choke_sched_data *q = qdisc_priv(sch); struct tcf_result res; + struct tcf_proto *fl; int result; - result = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -249,7 +251,7 @@ static bool choke_match_random(const struct choke_sched_data *q, return false; oskb = choke_peek_random(q, pidx); - if (q->filter_list) + if (rcu_access_pointer(q->filter_list)) return choke_get_classid(nskb) == choke_get_classid(oskb); return choke_match_flow(oskb, nskb); @@ -257,11 +259,11 @@ static bool choke_match_random(const struct choke_sched_data *q, static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) { + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; struct choke_sched_data *q = qdisc_priv(sch); const struct red_parms *p = &q->parms; - int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - if (q->filter_list) { + if (rcu_access_pointer(q->filter_list)) { /* If using external classifiers, get result and record it. */ if (!choke_classify(skb, sch, &ret)) goto other_drop; /* Packet was eaten by filter */ @@ -554,7 +556,8 @@ static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent, return 0; } -static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct choke_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 7bbbfe112192..d8b5ccfd248a 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -35,7 +35,7 @@ struct drr_class { struct drr_sched { struct list_head active; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct Qdisc_class_hash clhash; }; @@ -184,7 +184,8 @@ static void drr_put_class(struct Qdisc *sch, unsigned long arg) drr_destroy_class(sch, cl); } -static struct tcf_proto **drr_tcf_chain(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **drr_tcf_chain(struct Qdisc *sch, + unsigned long cl) { struct drr_sched *q = qdisc_priv(sch); @@ -319,6 +320,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct tcf_result res; + struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { @@ -328,7 +330,8 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 49d6ef338b55..485e456c8139 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -37,7 +37,7 @@ struct dsmark_qdisc_data { struct Qdisc *q; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; u8 *mask; /* "owns" the array */ u8 *value; u16 indices; @@ -186,8 +186,8 @@ ignore: } } -static inline struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch, - unsigned long cl) +static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct dsmark_qdisc_data *p = qdisc_priv(sch); return &p->filter_list; @@ -229,7 +229,8 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) skb->tc_index = TC_H_MIN(skb->priority); else { struct tcf_result res; - int result = tc_classify(skb, p->filter_list, &res); + struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); + int result = tc_classify(skb, fl, &res); pr_debug("result %d class 0x%04x\n", result, res.classid); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index cc56c8bb9bed..105cf5557630 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -52,7 +52,7 @@ struct fq_codel_flow { }; /* please try to keep this structure <= 64 bytes */ struct fq_codel_sched_data { - struct tcf_proto *filter_list; /* optional external classifier */ + struct tcf_proto __rcu *filter_list; /* optional external classifier */ struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ u32 *backlogs; /* backlog table [flows_cnt] */ u32 flows_cnt; /* number of flows */ @@ -85,6 +85,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tcf_proto *filter; struct tcf_result res; int result; @@ -93,11 +94,12 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, TC_H_MIN(skb->priority) <= q->flows_cnt) return TC_H_MIN(skb->priority); - if (!q->filter_list) + filter = rcu_dereference(q->filter_list); + if (!filter) return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + result = tc_classify(skb, filter, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -496,7 +498,8 @@ static void fq_codel_put(struct Qdisc *q, unsigned long cl) { } -static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **fq_codel_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct fq_codel_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index ec8aeaac1dd7..04b0de4c68b5 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -116,7 +116,7 @@ struct hfsc_class { struct gnet_stats_queue qstats; struct gnet_stats_rate_est64 rate_est; unsigned int level; /* class level in hierarchy */ - struct tcf_proto *filter_list; /* filter list */ + struct tcf_proto __rcu *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ struct hfsc_sched *sched; /* scheduler data */ @@ -1161,7 +1161,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; - tcf = q->root.filter_list; + tcf = rcu_dereference_bh(q->root.filter_list); while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -1185,7 +1185,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) return cl; /* hit leaf class */ /* apply inner filter chain */ - tcf = cl->filter_list; + tcf = rcu_dereference_bh(cl->filter_list); head = cl; } @@ -1285,7 +1285,7 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) cl->filter_cnt--; } -static struct tcf_proto ** +static struct tcf_proto __rcu ** hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) { struct hfsc_sched *q = qdisc_priv(sch); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index aea942ce6008..6d16b9b81c28 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -103,7 +103,7 @@ struct htb_class { u32 prio; /* these two are used only by leaves... */ int quantum; /* but stored for parent-to-leaf return */ - struct tcf_proto *filter_list; /* class attached filters */ + struct tcf_proto __rcu *filter_list; /* class attached filters */ int filter_cnt; int refcnt; /* usage count of this class */ @@ -153,7 +153,7 @@ struct htb_sched { int rate2quantum; /* quant = rate / rate2quantum */ /* filters for qdisc itself */ - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; #define HTB_WARN_TOOMANYEVENTS 0x1 unsigned int warned; /* only one warning */ @@ -223,9 +223,9 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, if (cl->level == 0) return cl; /* Start with inner filter chain if a non-leaf class is selected */ - tcf = cl->filter_list; + tcf = rcu_dereference_bh(cl->filter_list); } else { - tcf = q->filter_list; + tcf = rcu_dereference_bh(q->filter_list); } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; @@ -251,7 +251,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, return cl; /* we hit leaf; return it */ /* we have got inner class; apply inner filter chain */ - tcf = cl->filter_list; + tcf = rcu_dereference_bh(cl->filter_list); } /* classification failed; try to use default class */ cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); @@ -1519,11 +1519,12 @@ failure: return err; } -static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg) +static struct tcf_proto __rcu **htb_find_tcf(struct Qdisc *sch, + unsigned long arg) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; - struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list; + struct tcf_proto __rcu **fl = cl ? &cl->filter_list : &q->filter_list; return fl; } diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 62871c14e1f9..b351125f3849 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -17,7 +17,7 @@ struct ingress_qdisc_data { - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; }; /* ------------------------- Class/flow operations ------------------------- */ @@ -46,7 +46,8 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) { } -static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct ingress_qdisc_data *p = qdisc_priv(sch); @@ -59,9 +60,10 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct ingress_qdisc_data *p = qdisc_priv(sch); struct tcf_result res; + struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); int result; - result = tc_classify(skb, p->filter_list, &res); + result = tc_classify(skb, fl, &res); qdisc_bstats_update(sch, skb); switch (result) { diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index afb050a735fa..c0466c1840f3 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -31,7 +31,7 @@ struct multiq_sched_data { u16 bands; u16 max_bands; u16 curband; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct Qdisc **queues; }; @@ -42,10 +42,11 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) struct multiq_sched_data *q = qdisc_priv(sch); u32 band; struct tcf_result res; + struct tcf_proto *fl = rcu_dereference_bh(q->filter_list); int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tc_classify(skb, q->filter_list, &res); + err = tc_classify(skb, fl, &res); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -388,7 +389,8 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **multiq_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct multiq_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 79359b69ad8d..03ef99e52a5c 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -24,7 +24,7 @@ struct prio_sched_data { int bands; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; u8 prio2band[TC_PRIO_MAX+1]; struct Qdisc *queues[TCQ_PRIO_BANDS]; }; @@ -36,11 +36,13 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) struct prio_sched_data *q = qdisc_priv(sch); u32 band = skb->priority; struct tcf_result res; + struct tcf_proto *fl; int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { - err = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + err = tc_classify(skb, fl, &res); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -50,7 +52,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) return NULL; } #endif - if (!q->filter_list || err < 0) { + if (!fl || err < 0) { if (TC_H_MAJ(band)) band = 0; return q->queues[q->prio2band[band & TC_PRIO_MAX]]; @@ -351,7 +353,8 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **prio_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct prio_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 8056fb4e618a..602ea01a4ddd 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -181,7 +181,7 @@ struct qfq_group { }; struct qfq_sched { - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct Qdisc_class_hash clhash; u64 oldV, V; /* Precise virtual times. */ @@ -576,7 +576,8 @@ static void qfq_put_class(struct Qdisc *sch, unsigned long arg) qfq_destroy_class(sch, cl); } -static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **qfq_tcf_chain(struct Qdisc *sch, + unsigned long cl) { struct qfq_sched *q = qdisc_priv(sch); @@ -704,6 +705,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; struct tcf_result res; + struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { @@ -714,7 +716,8 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 9b0f7093d970..1562fb2b3f46 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -55,7 +55,7 @@ struct sfb_bins { struct sfb_sched_data { struct Qdisc *qdisc; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; unsigned long rehash_interval; unsigned long warmup_time; /* double buffering warmup time in jiffies */ u32 max; @@ -253,13 +253,13 @@ static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) return false; } -static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q, +static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, int *qerr, u32 *salt) { struct tcf_result res; int result; - result = tc_classify(skb, q->filter_list, &res); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -281,6 +281,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; + struct tcf_proto *fl; int i; u32 p_min = ~0; u32 minqlen = ~0; @@ -306,9 +307,10 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } } - if (q->filter_list) { + fl = rcu_dereference_bh(q->filter_list); + if (fl) { /* If using external classifiers, get result and record it. */ - if (!sfb_classify(skb, q, &ret, &salt)) + if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; keys.src = salt; keys.dst = 0; @@ -660,7 +662,8 @@ static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **sfb_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct sfb_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 211db9017c35..80c36bd54abc 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -125,7 +125,7 @@ struct sfq_sched_data { u8 cur_depth; /* depth of longest slot */ u8 flags; unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; sfq_index *ht; /* Hash table ('divisor' slots) */ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ @@ -187,6 +187,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, { struct sfq_sched_data *q = qdisc_priv(sch); struct tcf_result res; + struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority) == sch->handle && @@ -194,13 +195,14 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, TC_H_MIN(skb->priority) <= q->divisor) return TC_H_MIN(skb->priority); - if (!q->filter_list) { + fl = rcu_dereference_bh(q->filter_list); + if (!fl) { skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys); return sfq_hash(q, skb) + 1; } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -836,7 +838,8 @@ static void sfq_put(struct Qdisc *q, unsigned long cl) { } -static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct sfq_sched_data *q = qdisc_priv(sch); -- cgit v1.2.3 From 331b72922c5f58d48fd5500acadc91777cc31970 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:08:20 -0700 Subject: net: sched: RCU cls_tcindex Make cls_tcindex RCU safe. This patch addds a new RCU routine rcu_dereference_bh_rtnl() to check caller either holds the rcu read lock or RTNL. This is needed to handle the case where tcindex_lookup() is being called in both cases. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 10 ++ net/sched/cls_tcindex.c | 248 ++++++++++++++++++++++++++++------------------ 2 files changed, 164 insertions(+), 94 deletions(-) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 167bae7bdfa4..6cacbce1a06c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -46,6 +46,16 @@ static inline int lockdep_rtnl_is_held(void) #define rcu_dereference_rtnl(p) \ rcu_dereference_check(p, lockdep_rtnl_is_held()) +/** + * rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking + * @p: The pointer to read, prior to dereference + * + * Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh() + * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference_bh() + */ +#define rcu_dereference_bh_rtnl(p) \ + rcu_dereference_bh_check(p, lockdep_rtnl_is_held()) + /** * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 3e9f76413b3b..a9f4279fbd69 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -32,19 +32,21 @@ struct tcindex_filter_result { struct tcindex_filter { u16 key; struct tcindex_filter_result result; - struct tcindex_filter *next; + struct tcindex_filter __rcu *next; + struct rcu_head rcu; }; struct tcindex_data { struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ - struct tcindex_filter **h; /* imperfect hash; only used if !perfect; - NULL if unused */ + struct tcindex_filter __rcu **h; /* imperfect hash; */ + struct tcf_proto *tp; u16 mask; /* AND key with mask */ - int shift; /* shift ANDed key to the right */ - int hash; /* hash table size; 0 if undefined */ - int alloc_hash; /* allocated size */ - int fall_through; /* 0: only classify if explicit match */ + u32 shift; /* shift ANDed key to the right */ + u32 hash; /* hash table size; 0 if undefined */ + u32 alloc_hash; /* allocated size */ + u32 fall_through; /* 0: only classify if explicit match */ + struct rcu_head rcu; }; static inline int @@ -56,13 +58,18 @@ tcindex_filter_is_set(struct tcindex_filter_result *r) static struct tcindex_filter_result * tcindex_lookup(struct tcindex_data *p, u16 key) { - struct tcindex_filter *f; + if (p->perfect) { + struct tcindex_filter_result *f = p->perfect + key; + + return tcindex_filter_is_set(f) ? f : NULL; + } else if (p->h) { + struct tcindex_filter __rcu **fp; + struct tcindex_filter *f; - if (p->perfect) - return tcindex_filter_is_set(p->perfect + key) ? - p->perfect + key : NULL; - else if (p->h) { - for (f = p->h[key % p->hash]; f; f = f->next) + fp = &p->h[key % p->hash]; + for (f = rcu_dereference_bh_rtnl(*fp); + f; + fp = &f->next, f = rcu_dereference_bh_rtnl(*fp)) if (f->key == key) return &f->result; } @@ -74,7 +81,7 @@ tcindex_lookup(struct tcindex_data *p, u16 key) static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rcu_dereference(tp->root); struct tcindex_filter_result *f; int key = (skb->tc_index & p->mask) >> p->shift; @@ -99,7 +106,7 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r; pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); @@ -129,49 +136,59 @@ static int tcindex_init(struct tcf_proto *tp) p->hash = DEFAULT_HASH_SIZE; p->fall_through = 1; - tp->root = p; + rcu_assign_pointer(tp->root, p); return 0; } - static int -__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) +tcindex_delete(struct tcf_proto *tp, unsigned long arg) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; + struct tcindex_filter __rcu **walk; struct tcindex_filter *f = NULL; - pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, f); + pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p\n", tp, arg, p); if (p->perfect) { if (!r->res.class) return -ENOENT; } else { int i; - struct tcindex_filter **walk = NULL; - for (i = 0; i < p->hash; i++) - for (walk = p->h+i; *walk; walk = &(*walk)->next) - if (&(*walk)->result == r) + for (i = 0; i < p->hash; i++) { + walk = p->h + i; + for (f = rtnl_dereference(*walk); f; + walk = &f->next, f = rtnl_dereference(*walk)) { + if (&f->result == r) goto found; + } + } return -ENOENT; found: - f = *walk; - if (lock) - tcf_tree_lock(tp); - *walk = f->next; - if (lock) - tcf_tree_unlock(tp); + rcu_assign_pointer(*walk, rtnl_dereference(f->next)); } tcf_unbind_filter(tp, &r->res); tcf_exts_destroy(tp, &r->exts); - kfree(f); + if (f) + kfree_rcu(f, rcu); return 0; } -static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) +static int tcindex_destroy_element(struct tcf_proto *tp, + unsigned long arg, + struct tcf_walker *walker) { - return __tcindex_delete(tp, arg, 1); + return tcindex_delete(tp, arg); +} + +static void __tcindex_destroy(struct rcu_head *head) +{ + struct tcindex_data *p = container_of(head, struct tcindex_data, rcu); + + kfree(p->perfect); + kfree(p->h); + kfree(p); } static inline int @@ -194,6 +211,14 @@ static void tcindex_filter_result_init(struct tcindex_filter_result *r) tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } +static void __tcindex_partial_destroy(struct rcu_head *head) +{ + struct tcindex_data *p = container_of(head, struct tcindex_data, rcu); + + kfree(p->perfect); + kfree(p); +} + static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, @@ -203,7 +228,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_filter_result cr; - struct tcindex_data cp; + struct tcindex_data *cp, *oldp; struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_exts e; @@ -212,84 +237,118 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (err < 0) return err; - memcpy(&cp, p, sizeof(cp)); - tcindex_filter_result_init(&new_filter_result); + /* tcindex_data attributes must look atomic to classifier/lookup so + * allocate new tcindex data and RCU assign it onto root. Keeping + * perfect hash and hash pointers from old data. + */ + cp = kzalloc(sizeof(cp), GFP_KERNEL); + if (!cp) + return -ENOMEM; + + cp->mask = p->mask; + cp->shift = p->shift; + cp->hash = p->hash; + cp->alloc_hash = p->alloc_hash; + cp->fall_through = p->fall_through; + cp->tp = tp; + + if (p->perfect) { + cp->perfect = kmemdup(p->perfect, + sizeof(*r) * cp->hash, GFP_KERNEL); + if (!cp->perfect) + goto errout; + } + cp->h = p->h; + + memset(&new_filter_result, 0, sizeof(new_filter_result)); + tcf_exts_init(&new_filter_result.exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); tcindex_filter_result_init(&cr); if (old_r) cr.res = r->res; if (tb[TCA_TCINDEX_HASH]) - cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); + cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); if (tb[TCA_TCINDEX_MASK]) - cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); + cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); if (tb[TCA_TCINDEX_SHIFT]) - cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); + cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); err = -EBUSY; + /* Hash already allocated, make sure that we still meet the * requirements for the allocated hash. */ - if (cp.perfect) { - if (!valid_perfect_hash(&cp) || - cp.hash > cp.alloc_hash) + if (cp->perfect) { + if (!valid_perfect_hash(cp) || + cp->hash > cp->alloc_hash) goto errout; - } else if (cp.h && cp.hash != cp.alloc_hash) + } else if (cp->h && cp->hash != cp->alloc_hash) { goto errout; + } err = -EINVAL; if (tb[TCA_TCINDEX_FALL_THROUGH]) - cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); + cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); - if (!cp.hash) { + if (!cp->hash) { /* Hash not specified, use perfect hash if the upper limit * of the hashing index is below the threshold. */ - if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) - cp.hash = (cp.mask >> cp.shift) + 1; + if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) + cp->hash = (cp->mask >> cp->shift) + 1; else - cp.hash = DEFAULT_HASH_SIZE; + cp->hash = DEFAULT_HASH_SIZE; } - if (!cp.perfect && !cp.h) - cp.alloc_hash = cp.hash; + if (!cp->perfect && cp->h) + cp->alloc_hash = cp->hash; /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) * but then, we'd fail handles that may become valid after some future * mask change. While this is extremely unlikely to ever matter, * the check below is safer (and also more backwards-compatible). */ - if (cp.perfect || valid_perfect_hash(&cp)) - if (handle >= cp.alloc_hash) + if (cp->perfect || valid_perfect_hash(cp)) + if (handle >= cp->alloc_hash) goto errout; err = -ENOMEM; - if (!cp.perfect && !cp.h) { - if (valid_perfect_hash(&cp)) { + if (!cp->perfect && !cp->h) { + if (valid_perfect_hash(cp)) { int i; - cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL); - if (!cp.perfect) + cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL); + if (!cp->perfect) goto errout; - for (i = 0; i < cp.hash; i++) - tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT, + for (i = 0; i < cp->hash; i++) + tcf_exts_init(&cp->perfect[i].exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); balloc = 1; } else { - cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); - if (!cp.h) + struct tcindex_filter __rcu **hash; + + hash = kcalloc(cp->hash, + sizeof(struct tcindex_filter *), + GFP_KERNEL); + + if (!hash) goto errout; + + cp->h = hash; balloc = 2; } } - if (cp.perfect) - r = cp.perfect + handle; + if (cp->perfect) + r = cp->perfect + handle; else - r = tcindex_lookup(&cp, handle) ? : &new_filter_result; + r = tcindex_lookup(cp, handle) ? : &new_filter_result; if (r == &new_filter_result) { f = kzalloc(sizeof(*f), GFP_KERNEL); @@ -307,33 +366,41 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, else tcf_exts_change(tp, &cr.exts, &e); - tcf_tree_lock(tp); if (old_r && old_r != r) tcindex_filter_result_init(old_r); - memcpy(p, &cp, sizeof(cp)); + oldp = p; r->res = cr.res; + rcu_assign_pointer(tp->root, cp); if (r == &new_filter_result) { - struct tcindex_filter **fp; + struct tcindex_filter *nfp; + struct tcindex_filter __rcu **fp; f->key = handle; f->result = new_filter_result; f->next = NULL; - for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next) - /* nothing */; - *fp = f; + + fp = p->h + (handle % p->hash); + for (nfp = rtnl_dereference(*fp); + nfp; + fp = &nfp->next, nfp = rtnl_dereference(*fp)) + ; /* nothing */ + + rcu_assign_pointer(*fp, f); } - tcf_tree_unlock(tp); + if (oldp) + call_rcu(&oldp->rcu, __tcindex_partial_destroy); return 0; errout_alloc: if (balloc == 1) - kfree(cp.perfect); + kfree(cp->perfect); else if (balloc == 2) - kfree(cp.h); + kfree(cp->h); errout: + kfree(cp); tcf_exts_destroy(tp, &e); return err; } @@ -345,7 +412,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; int err; @@ -364,10 +431,9 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, tca[TCA_RATE], ovr); } - static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter *f, *next; int i; @@ -390,8 +456,8 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) if (!p->h) return; for (i = 0; i < p->hash; i++) { - for (f = p->h[i]; f; f = next) { - next = f->next; + for (f = rtnl_dereference(p->h[i]); f; f = next) { + next = rtnl_dereference(f->next); if (walker->count >= walker->skip) { if (walker->fn(tp, (unsigned long) &f->result, walker) < 0) { @@ -404,17 +470,9 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) } } - -static int tcindex_destroy_element(struct tcf_proto *tp, - unsigned long arg, struct tcf_walker *walker) -{ - return __tcindex_delete(tp, arg, 0); -} - - static void tcindex_destroy(struct tcf_proto *tp) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcf_walker walker; pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); @@ -422,17 +480,16 @@ static void tcindex_destroy(struct tcf_proto *tp) walker.skip = 0; walker.fn = tcindex_destroy_element; tcindex_walk(tp, &walker); - kfree(p->perfect); - kfree(p->h); - kfree(p); - tp->root = NULL; + + RCU_INIT_POINTER(tp->root, NULL); + call_rcu(&p->rcu, __tcindex_destroy); } static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -455,15 +512,18 @@ static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, nla_nest_end(skb, nest); } else { if (p->perfect) { - t->tcm_handle = r-p->perfect; + t->tcm_handle = r - p->perfect; } else { struct tcindex_filter *f; + struct tcindex_filter __rcu **fp; int i; t->tcm_handle = 0; for (i = 0; !t->tcm_handle && i < p->hash; i++) { - for (f = p->h[i]; !t->tcm_handle && f; - f = f->next) { + fp = &p->h[i]; + for (f = rtnl_dereference(*fp); + !t->tcm_handle && f; + fp = &f->next, f = rtnl_dereference(*fp)) { if (&f->result == r) t->tcm_handle = f->key; } -- cgit v1.2.3 From 6c555490e0ce885a9caf0a045db69382a3ccbc9c Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:09 -0700 Subject: ipv6: drop useless rcu_read_lock() in anycast These code is now protected by rtnl lock, rcu read lock is useless now. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- net/core/dev.c | 14 ++++++++------ net/ipv6/anycast.c | 18 ++++++------------ 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ae721f53739e..ee38b948d9a0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2083,8 +2083,8 @@ void __dev_remove_pack(struct packet_type *pt); void dev_add_offload(struct packet_offload *po); void dev_remove_offload(struct packet_offload *po); -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short flags, - unsigned short mask); +struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags, + unsigned short mask); struct net_device *dev_get_by_name(struct net *net, const char *name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); diff --git a/net/core/dev.c b/net/core/dev.c index b3d6dbc0c696..e916ba8caccf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -897,23 +897,25 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags_rcu - find any device with given flags + * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside - * rcu_read_lock(), and result refcount is unchanged. + * rtnl_lock(), and result refcount is unchanged. */ -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, + unsigned short mask) { struct net_device *dev, *ret; + ASSERT_RTNL(); + ret = NULL; - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; @@ -921,7 +923,7 @@ struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags } return ret; } -EXPORT_SYMBOL(dev_get_by_flags_rcu); +EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index ff2de7d9d8e6..3b0429bc6b5a 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -78,7 +78,6 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) pac->acl_addr = *addr; rtnl_lock(); - rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; @@ -91,11 +90,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) goto error; } else { /* router, no matching interface: just pick one */ - dev = dev_get_by_flags_rcu(net, IFF_UP, - IFF_UP | IFF_LOOPBACK); + dev = __dev_get_by_flags(net, IFF_UP, + IFF_UP | IFF_LOOPBACK); } } else - dev = dev_get_by_index_rcu(net, ifindex); + dev = __dev_get_by_index(net, ifindex); if (dev == NULL) { err = -ENODEV; @@ -137,7 +136,6 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) } error: - rcu_read_unlock(); rtnl_unlock(); if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); @@ -174,11 +172,9 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) spin_unlock_bh(&ipv6_sk_ac_lock); rtnl_lock(); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + dev = __dev_get_by_index(net, pac->acl_ifindex); if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); - rcu_read_unlock(); rtnl_unlock(); sock_kfree_s(sk, pac, sizeof(*pac)); @@ -203,12 +199,11 @@ void ipv6_sock_ac_close(struct sock *sk) prev_index = 0; rtnl_lock(); - rcu_read_lock(); while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { - dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + dev = __dev_get_by_index(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) @@ -216,7 +211,6 @@ void ipv6_sock_ac_close(struct sock *sk) sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } - rcu_read_unlock(); rtnl_unlock(); } @@ -341,7 +335,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) return 0; } -/* called with rcu_read_lock() */ +/* called with rtnl_lock() */ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev = __in6_dev_get(dev); -- cgit v1.2.3 From 013b4d90387a5dca54281263e0d4650db97bd67c Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:11 -0700 Subject: ipv6: clean up ipv6_dev_ac_inc() Make it accept inet6_dev, and rename it to __ipv6_dev_ac_inc() to reflect this change. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/anycast.c | 13 ++++--------- 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index f679877bb601..9b1d42e66cca 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -202,7 +202,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void ipv6_sock_ac_close(struct sock *sk); -int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr); +int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr); int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr); bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ad4598fcc416..6b6a373a30b2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1725,7 +1725,7 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); if (ipv6_addr_any(&addr)) return; - ipv6_dev_ac_inc(ifp->idev->dev, &addr); + __ipv6_dev_ac_inc(ifp->idev, &addr); } /* caller must hold RTNL */ diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index d10f2e2b49c6..66c19320119a 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -122,7 +122,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) goto error; } - err = ipv6_dev_ac_inc(dev, addr); + err = __ipv6_dev_ac_inc(idev, addr); if (!err) { pac->acl_next = np->ipv6_ac_list; np->ipv6_ac_list = pac; @@ -215,20 +215,15 @@ static void aca_put(struct ifacaddr6 *ac) /* * device anycast group inc (add if not found) */ -int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) +int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; - struct inet6_dev *idev; struct rt6_info *rt; int err; ASSERT_RTNL(); - idev = in6_dev_get(dev); - - if (idev == NULL) - return -EINVAL; - + in6_dev_hold(idev); write_lock_bh(&idev->lock); if (idev->dead) { err = -ENODEV; @@ -276,7 +271,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) ip6_ins_rt(rt); - addrconf_join_solict(dev, &aca->aca_addr); + addrconf_join_solict(idev->dev, &aca->aca_addr); aca_put(aca); return 0; -- cgit v1.2.3 From ac7a04c33dd7f8e429df4b929ba3a3e8e729cc89 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 11 Sep 2014 21:18:09 -0700 Subject: net: dsa: change tag_protocol to an enum Now that we introduced an additional multiplexing/demultiplexing layer with commit 3e8a72d1dae37 ("net: dsa: reduce number of protocol hooks") that lives within the DSA code, we no longer need to have a given switch driver tag_protocol be an actual ethertype value, instead, we can replace it with an enum: dsa_tag_protocol. Do this replacement in the drivers, which allows us to get rid of the cpu_to_be16()/htons() dance, and remove ETH_P_BRCMTAG since we do not need it anymore. Suggested-by: Alexander Duyck Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 2 +- drivers/net/dsa/mv88e6060.c | 2 +- drivers/net/dsa/mv88e6123_61_65.c | 4 ++-- drivers/net/dsa/mv88e6131.c | 2 +- include/net/dsa.h | 17 ++++++++++------- net/dsa/slave.c | 8 ++++---- net/dsa/tag_brcm.c | 1 - 7 files changed, 19 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index bb7cb8e283b1..e9918c7f1792 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -592,7 +592,7 @@ static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port, } static struct dsa_switch_driver bcm_sf2_switch_driver = { - .tag_protocol = htons(ETH_P_BRCMTAG), + .tag_protocol = DSA_TAG_PROTO_BRCM, .priv_size = sizeof(struct bcm_sf2_priv), .probe = bcm_sf2_sw_probe, .setup = bcm_sf2_sw_setup, diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index 7a54ec04b418..d8037c1055ac 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -258,7 +258,7 @@ static void mv88e6060_poll_link(struct dsa_switch *ds) } static struct dsa_switch_driver mv88e6060_switch_driver = { - .tag_protocol = htons(ETH_P_TRAILER), + .tag_protocol = DSA_TAG_PROTO_TRAILER, .probe = mv88e6060_probe, .setup = mv88e6060_setup, .set_addr = mv88e6060_set_addr, diff --git a/drivers/net/dsa/mv88e6123_61_65.c b/drivers/net/dsa/mv88e6123_61_65.c index 69c42513dd72..975774f338c2 100644 --- a/drivers/net/dsa/mv88e6123_61_65.c +++ b/drivers/net/dsa/mv88e6123_61_65.c @@ -207,7 +207,7 @@ static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p) */ val = 0x0433; if (dsa_is_cpu_port(ds, p)) { - if (ds->dst->tag_protocol == htons(ETH_P_EDSA)) + if (ds->dst->tag_protocol == DSA_TAG_PROTO_EDSA) val |= 0x3300; else val |= 0x0100; @@ -391,7 +391,7 @@ static int mv88e6123_61_65_get_sset_count(struct dsa_switch *ds) } struct dsa_switch_driver mv88e6123_61_65_switch_driver = { - .tag_protocol = cpu_to_be16(ETH_P_EDSA), + .tag_protocol = DSA_TAG_PROTO_EDSA, .priv_size = sizeof(struct mv88e6xxx_priv_state), .probe = mv88e6123_61_65_probe, .setup = mv88e6123_61_65_setup, diff --git a/drivers/net/dsa/mv88e6131.c b/drivers/net/dsa/mv88e6131.c index 953bc6a49e59..35541f2ceca3 100644 --- a/drivers/net/dsa/mv88e6131.c +++ b/drivers/net/dsa/mv88e6131.c @@ -379,7 +379,7 @@ static int mv88e6131_get_sset_count(struct dsa_switch *ds) } struct dsa_switch_driver mv88e6131_switch_driver = { - .tag_protocol = cpu_to_be16(ETH_P_DSA), + .tag_protocol = DSA_TAG_PROTO_DSA, .priv_size = sizeof(struct mv88e6xxx_priv_state), .probe = mv88e6131_probe, .setup = mv88e6131_setup, diff --git a/include/net/dsa.h b/include/net/dsa.h index 97712927a9d2..8a8a5d976f97 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -19,10 +19,13 @@ #include #include -/* Not an official ethertype value, used only internally for DSA - * demultiplexing - */ -#define ETH_P_BRCMTAG (ETH_P_XDSA + 1) +enum dsa_tag_protocol { + DSA_TAG_PROTO_NONE = 0, + DSA_TAG_PROTO_DSA, + DSA_TAG_PROTO_TRAILER, + DSA_TAG_PROTO_EDSA, + DSA_TAG_PROTO_BRCM, +}; #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 @@ -89,7 +92,7 @@ struct dsa_switch_tree { */ struct net_device *master_netdev; const struct dsa_device_ops *ops; - __be16 tag_protocol; + enum dsa_tag_protocol tag_protocol; /* * The switch and port to which the CPU is attached. @@ -166,7 +169,7 @@ static inline u8 dsa_upstream_port(struct dsa_switch *ds) struct dsa_switch_driver { struct list_head list; - __be16 tag_protocol; + enum dsa_tag_protocol tag_protocol; int priv_size; /* @@ -215,7 +218,7 @@ static inline void *ds_to_priv(struct dsa_switch *ds) static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) { - return dst->tag_protocol != 0; + return dst->tag_protocol != DSA_TAG_PROTO_NONE; } #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 7333a4aebb7d..809eeb13eb12 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -437,22 +437,22 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA - case htons(ETH_P_DSA): + case DSA_TAG_PROTO_DSA: ds->dst->ops = &dsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_EDSA - case htons(ETH_P_EDSA): + case DSA_TAG_PROTO_EDSA: ds->dst->ops = &edsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER - case htons(ETH_P_TRAILER): + case DSA_TAG_PROTO_TRAILER: ds->dst->ops = &trailer_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_BRCM - case htons(ETH_P_BRCMTAG): + case DSA_TAG_PROTO_BRCM: ds->dst->ops = &brcm_netdev_ops; break; #endif diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index e0b759ec209e..8fbc21c0de78 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -91,7 +91,6 @@ static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ - skb->protocol = htons(ETH_P_BRCMTAG); skb->dev = p->parent->dst->master_netdev; dev_queue_xmit(skb); -- cgit v1.2.3 From 233577a22089facf5271ab5e845b2262047c971f Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 12 Sep 2014 14:04:43 +0200 Subject: net: filter: constify detection of pkt_type_offset Currently we have 2 pkt_type_offset functions doing the same thing and spread across the architecture files. Remove those and replace them with a PKT_TYPE_OFFSET macro helper which gets the constant value from a zero sized sk_buff member right in front of the bitfield with offsetof. This new offset marker does not change size of struct sk_buff. Cc: Eric Dumazet Cc: Markos Chandras Cc: Martin Schwidefsky Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Denis Kirjanov Signed-off-by: Hannes Frederic Sowa Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- arch/mips/net/bpf_jit.c | 27 +-------------------------- arch/s390/net/bpf_jit_comp.c | 35 +---------------------------------- include/linux/skbuff.h | 10 ++++++++++ net/core/filter.c | 31 ++----------------------------- 4 files changed, 14 insertions(+), 89 deletions(-) (limited to 'include') diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 0e97ccd29fe3..7edc08398c4a 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -765,27 +765,6 @@ static u64 jit_get_skb_w(struct sk_buff *skb, unsigned offset) return (u64)err << 32 | ntohl(ret); } -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_TYPE_MAX (7 << 5) -#else -#define PKT_TYPE_MAX 7 -#endif -static int pkt_type_offset(void) -{ - struct sk_buff skb_probe = { - .pkt_type = ~0, - }; - u8 *ct = (u8 *)&skb_probe; - unsigned int off; - - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (ct[off] == PKT_TYPE_MAX) - return off; - } - pr_err_once("Please fix pkt_type_offset(), as pkt_type couldn't be found\n"); - return -1; -} - static int build_body(struct jit_ctx *ctx) { void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w}; @@ -1332,11 +1311,7 @@ jmp_cmp: case BPF_ANC | SKF_AD_PKTTYPE: ctx->flags |= SEEN_SKB; - off = pkt_type_offset(); - - if (off < 0) - return -1; - emit_load_byte(r_tmp, r_skb, off, ctx); + emit_load_byte(r_tmp, r_skb, PKT_TYPE_OFFSET(), ctx); /* Keep only the last 3 bits */ emit_andi(r_A, r_tmp, PKT_TYPE_MAX, ctx); #ifdef __BIG_ENDIAN_BITFIELD diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 555f5c7e83ab..c52ac77408ca 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -227,37 +227,6 @@ static void bpf_jit_epilogue(struct bpf_jit *jit) EMIT2(0x07fe); } -/* Helper to find the offset of pkt_type in sk_buff - * Make sure its still a 3bit field starting at the MSBs within a byte. - */ -#define PKT_TYPE_MAX 0xe0 -static int pkt_type_offset; - -static int __init bpf_pkt_type_offset_init(void) -{ - struct sk_buff skb_probe = { - .pkt_type = ~0, - }; - char *ct = (char *)&skb_probe; - int off; - - pkt_type_offset = -1; - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (!ct[off]) - continue; - if (ct[off] == PKT_TYPE_MAX) - pkt_type_offset = off; - else { - /* Found non matching bit pattern, fix needed. */ - WARN_ON_ONCE(1); - pkt_type_offset = -1; - return -1; - } - } - return 0; -} -device_initcall(bpf_pkt_type_offset_init); - /* * make sure we dont leak kernel information to user */ @@ -757,12 +726,10 @@ call_fn: /* lg %r1,(%r13) */ } break; case BPF_ANC | SKF_AD_PKTTYPE: - if (pkt_type_offset < 0) - goto out; /* lhi %r5,0 */ EMIT4(0xa7580000); /* ic %r5,(%r2) */ - EMIT4_DISP(0x43502000, pkt_type_offset); + EMIT4_DISP(0x43502000, PKT_TYPE_OFFSET()); /* srl %r5,5 */ EMIT4_DISP(0x88500000, 5); break; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 07c9fdd0c126..756e3d057e84 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -548,6 +548,16 @@ struct sk_buff { ip_summed:2, nohdr:1, nfctinfo:3; + +/* if you move pkt_type around you also must adapt those constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_TYPE_MAX (7 << 5) +#else +#define PKT_TYPE_MAX 7 +#endif +#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) + + __u8 __pkt_type_offset[0]; __u8 pkt_type:3, fclone:2, ipvs_property:1, diff --git a/net/core/filter.c b/net/core/filter.c index dfc716ffa44b..601f28de7311 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,30 +87,6 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sk_filter); -/* Helper to find the offset of pkt_type in sk_buff structure. We want - * to make sure its still a 3bit field starting at a byte boundary; - * taken from arch/x86/net/bpf_jit_comp.c. - */ -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_TYPE_MAX (7 << 5) -#else -#define PKT_TYPE_MAX 7 -#endif -static unsigned int pkt_type_offset(void) -{ - struct sk_buff skb_probe = { .pkt_type = ~0, }; - u8 *ct = (u8 *) &skb_probe; - unsigned int off; - - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (ct[off] == PKT_TYPE_MAX) - return off; - } - - pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__); - return -1; -} - static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { return skb_get_poff((struct sk_buff *)(unsigned long) ctx); @@ -190,11 +166,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, - pkt_type_offset()); - if (insn->off < 0) - return false; - insn++; + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, + PKT_TYPE_OFFSET()); *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD insn++; -- cgit v1.2.3 From 3fc8867740b4a0bf56f372c6f5ddd14970962fb1 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 12 Sep 2014 23:12:46 -0700 Subject: netdevice: Support DSA tagging when DSA is built as a module This change corrects an error seen when DSA tagging is built as a module. Without this change it is not possible to get XDSA tagged frames as the test for tagging is stripped by the #ifdef check. Signed-off-by: Alexander Duyck Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ee38b948d9a0..f9e81d10a3b9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1789,7 +1789,7 @@ void dev_net_set(struct net_device *dev, struct net *net) static inline bool netdev_uses_dsa(struct net_device *dev) { -#ifdef CONFIG_NET_DSA +#if IS_ENABLED(CONFIG_NET_DSA) if (dev->dsa_ptr != NULL) return dsa_uses_tagged_protocol(dev->dsa_ptr); #endif -- cgit v1.2.3 From 43e73e4e2ad05d9bf3b438cfbe1e71b57a85f26c Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 14 Sep 2014 23:06:28 +0200 Subject: Bluetooth: Provide HCI command opcode information to driver The Bluetooth core already does processing of the HCI command header and puts it together before sending it to the driver. It is not really efficient for the driver to look at the HCI command header again in case it has to make certain decisions about certain commands. To make this easier, just provide the opcode as part of the SKB control buffer information. The extra information about the opcode is optional and only provided for HCI commands. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 1 + net/bluetooth/hci_core.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 373000de610d..7e666d06b97f 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -284,6 +284,7 @@ struct hci_req_ctrl { struct bt_skb_cb { __u8 pkt_type; __u8 incoming; + __u16 opcode; __u16 expect; __u8 force_active; struct l2cap_chan *chan; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 067526d9680d..41948678f514 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4547,6 +4547,7 @@ static struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, BT_DBG("skb len %d", skb->len); bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; + bt_cb(skb)->opcode = opcode; return skb; } -- cgit v1.2.3 From 0e9871e3f79fd17c691b50a9669220c54ff084a2 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Thu, 28 Aug 2014 10:11:27 +0400 Subject: netfilter: ipset: Add skbinfo extension kernel support in the ipset core. Skbinfo extension provides mapping of metainformation with lookup in the ipset tables. This patch defines the flags, the constants, the functions and the structures for the data type independent support of the extension. Note the firewall mark stores in the kernel structures as two 32bit values, but transfered through netlink as one 64bit value. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 56 ++++++++++++++++++++++++++++- include/uapi/linux/netfilter/ipset/ip_set.h | 12 +++++++ net/netfilter/ipset/ip_set_core.c | 27 +++++++++++++- 3 files changed, 93 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 96afc29184be..b97aac5142ed 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -57,6 +57,8 @@ enum ip_set_extension { IPSET_EXT_COUNTER = (1 << IPSET_EXT_BIT_COUNTER), IPSET_EXT_BIT_COMMENT = 2, IPSET_EXT_COMMENT = (1 << IPSET_EXT_BIT_COMMENT), + IPSET_EXT_BIT_SKBINFO = 3, + IPSET_EXT_SKBINFO = (1 << IPSET_EXT_BIT_SKBINFO), /* Mark set with an extension which needs to call destroy */ IPSET_EXT_BIT_DESTROY = 7, IPSET_EXT_DESTROY = (1 << IPSET_EXT_BIT_DESTROY), @@ -65,12 +67,14 @@ enum ip_set_extension { #define SET_WITH_TIMEOUT(s) ((s)->extensions & IPSET_EXT_TIMEOUT) #define SET_WITH_COUNTER(s) ((s)->extensions & IPSET_EXT_COUNTER) #define SET_WITH_COMMENT(s) ((s)->extensions & IPSET_EXT_COMMENT) +#define SET_WITH_SKBINFO(s) ((s)->extensions & IPSET_EXT_SKBINFO) #define SET_WITH_FORCEADD(s) ((s)->flags & IPSET_CREATE_FLAG_FORCEADD) /* Extension id, in size order */ enum ip_set_ext_id { IPSET_EXT_ID_COUNTER = 0, IPSET_EXT_ID_TIMEOUT, + IPSET_EXT_ID_SKBINFO, IPSET_EXT_ID_COMMENT, IPSET_EXT_ID_MAX, }; @@ -92,6 +96,10 @@ struct ip_set_ext { u64 packets; u64 bytes; u32 timeout; + u32 skbmark; + u32 skbmarkmask; + u32 skbprio; + u16 skbqueue; char *comment; }; @@ -104,6 +112,13 @@ struct ip_set_comment { char *str; }; +struct ip_set_skbinfo { + u32 skbmark; + u32 skbmarkmask; + u32 skbprio; + u16 skbqueue; +}; + struct ip_set; #define ext_timeout(e, s) \ @@ -112,7 +127,8 @@ struct ip_set; (struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER]) #define ext_comment(e, s) \ (struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT]) - +#define ext_skbinfo(e, s) \ +(struct ip_set_skbinfo *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_SKBINFO]) typedef int (*ipset_adtfn)(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -256,6 +272,8 @@ ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) cadt_flags |= IPSET_FLAG_WITH_COUNTERS; if (SET_WITH_COMMENT(set)) cadt_flags |= IPSET_FLAG_WITH_COMMENT; + if (SET_WITH_SKBINFO(set)) + cadt_flags |= IPSET_FLAG_WITH_SKBINFO; if (SET_WITH_FORCEADD(set)) cadt_flags |= IPSET_FLAG_WITH_FORCEADD; @@ -304,6 +322,39 @@ ip_set_update_counter(struct ip_set_counter *counter, } } +static inline void +ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + mext->skbmark = skbinfo->skbmark; + mext->skbmarkmask = skbinfo->skbmarkmask; + mext->skbprio = skbinfo->skbprio; + mext->skbqueue = skbinfo->skbqueue; +} +static inline bool +ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo) +{ + return nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask)) || + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio)) || + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue)); + +} + +static inline void +ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext) +{ + skbinfo->skbmark = ext->skbmark; + skbinfo->skbmarkmask = ext->skbmarkmask; + skbinfo->skbprio = ext->skbprio; + skbinfo->skbqueue = ext->skbqueue; +} + static inline bool ip_set_put_counter(struct sk_buff *skb, struct ip_set_counter *counter) { @@ -497,6 +548,9 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, if (SET_WITH_COMMENT(set) && ip_set_put_comment(skb, ext_comment(e, set))) return -EMSGSIZE; + if (SET_WITH_SKBINFO(set) && + ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 78c2f2e79920..ca03119111a2 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -115,6 +115,9 @@ enum { IPSET_ATTR_BYTES, IPSET_ATTR_PACKETS, IPSET_ATTR_COMMENT, + IPSET_ATTR_SKBMARK, + IPSET_ATTR_SKBPRIO, + IPSET_ATTR_SKBQUEUE, __IPSET_ATTR_ADT_MAX, }; #define IPSET_ATTR_ADT_MAX (__IPSET_ATTR_ADT_MAX - 1) @@ -147,6 +150,7 @@ enum ipset_errno { IPSET_ERR_COUNTER, IPSET_ERR_COMMENT, IPSET_ERR_INVALID_MARKMASK, + IPSET_ERR_SKBINFO, /* Type specific error codes */ IPSET_ERR_TYPE_SPECIFIC = 4352, @@ -170,6 +174,12 @@ enum ipset_cmd_flags { IPSET_FLAG_MATCH_COUNTERS = (1 << IPSET_FLAG_BIT_MATCH_COUNTERS), IPSET_FLAG_BIT_RETURN_NOMATCH = 7, IPSET_FLAG_RETURN_NOMATCH = (1 << IPSET_FLAG_BIT_RETURN_NOMATCH), + IPSET_FLAG_BIT_MAP_SKBMARK = 8, + IPSET_FLAG_MAP_SKBMARK = (1 << IPSET_FLAG_BIT_MAP_SKBMARK), + IPSET_FLAG_BIT_MAP_SKBPRIO = 9, + IPSET_FLAG_MAP_SKBPRIO = (1 << IPSET_FLAG_BIT_MAP_SKBPRIO), + IPSET_FLAG_BIT_MAP_SKBQUEUE = 10, + IPSET_FLAG_MAP_SKBQUEUE = (1 << IPSET_FLAG_BIT_MAP_SKBQUEUE), IPSET_FLAG_CMD_MAX = 15, }; @@ -187,6 +197,8 @@ enum ipset_cadt_flags { IPSET_FLAG_WITH_COMMENT = (1 << IPSET_FLAG_BIT_WITH_COMMENT), IPSET_FLAG_BIT_WITH_FORCEADD = 5, IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD), + IPSET_FLAG_BIT_WITH_SKBINFO = 6, + IPSET_FLAG_WITH_SKBINFO = (1 << IPSET_FLAG_BIT_WITH_SKBINFO), IPSET_FLAG_CADT_MAX = 15, }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 4ca4e5ca6f57..26c795e6b57f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -337,6 +337,12 @@ const struct ip_set_ext_type ip_set_extensions[] = { .len = sizeof(unsigned long), .align = __alignof__(unsigned long), }, + [IPSET_EXT_ID_SKBINFO] = { + .type = IPSET_EXT_SKBINFO, + .flag = IPSET_FLAG_WITH_SKBINFO, + .len = sizeof(struct ip_set_skbinfo), + .align = __alignof__(struct ip_set_skbinfo), + }, [IPSET_EXT_ID_COMMENT] = { .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, .flag = IPSET_FLAG_WITH_COMMENT, @@ -382,6 +388,7 @@ int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { + u64 fullmark; if (tb[IPSET_ATTR_TIMEOUT]) { if (!(set->extensions & IPSET_EXT_TIMEOUT)) return -IPSET_ERR_TIMEOUT; @@ -402,7 +409,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } - + if (tb[IPSET_ATTR_SKBMARK]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); + ext->skbmark = fullmark >> 32; + ext->skbmarkmask = fullmark & 0xffffffff; + } + if (tb[IPSET_ATTR_SKBPRIO]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + ext->skbprio = be32_to_cpu(nla_get_be32( + tb[IPSET_ATTR_SKBPRIO])); + } + if (tb[IPSET_ATTR_SKBQUEUE]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + ext->skbqueue = be16_to_cpu(nla_get_be16( + tb[IPSET_ATTR_SKBQUEUE])); + } return 0; } EXPORT_SYMBOL_GPL(ip_set_get_extensions); -- cgit v1.2.3 From 76cea4109ca89dea218fdc652d2e1535fd9b5fc7 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Tue, 2 Sep 2014 14:21:20 +0400 Subject: netfilter: ipset: Add skbinfo extension support to SET target. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- include/uapi/linux/netfilter/xt_set.h | 10 +++ net/netfilter/xt_set.c | 155 ++++++++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/xt_set.h b/include/uapi/linux/netfilter/xt_set.h index 964d3d42f874..d6a1df1f2947 100644 --- a/include/uapi/linux/netfilter/xt_set.h +++ b/include/uapi/linux/netfilter/xt_set.h @@ -71,4 +71,14 @@ struct xt_set_info_match_v3 { __u32 flags; }; +/* Revision 3 target */ + +struct xt_set_info_target_v3 { + struct xt_set_info add_set; + struct xt_set_info del_set; + struct xt_set_info map_set; + __u32 flags; + __u32 timeout; +}; + #endif /*_XT_SET_H*/ diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index cb70f6ec5695..5732cd64acc0 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -366,6 +366,140 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) #define set_target_v2_checkentry set_target_v1_checkentry #define set_target_v2_destroy set_target_v1_destroy +/* Revision 3 target */ + +static unsigned int +set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, info->flags, info->timeout); + ADT_OPT(del_opt, par->family, info->del_set.dim, + info->del_set.flags, 0, UINT_MAX); + ADT_OPT(map_opt, par->family, info->map_set.dim, + info->map_set.flags, 0, UINT_MAX); + + int ret; + + /* Normalize to fit into jiffies */ + if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && + add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par, &add_opt); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par, &del_opt); + if (info->map_set.index != IPSET_INVALID_ID) { + map_opt.cmdflags |= info->flags & (IPSET_FLAG_MAP_SKBMARK | + IPSET_FLAG_MAP_SKBPRIO | + IPSET_FLAG_MAP_SKBQUEUE); + ret = match_set(info->map_set.index, skb, par, &map_opt, + info->map_set.flags & IPSET_INV_MATCH); + if (!ret) + return XT_CONTINUE; + if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBMARK) + skb->mark = (skb->mark & ~(map_opt.ext.skbmarkmask)) + ^ (map_opt.ext.skbmark); + if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBPRIO) + skb->priority = map_opt.ext.skbprio; + if ((map_opt.cmdflags & IPSET_FLAG_MAP_SKBQUEUE) && + skb->dev && + skb->dev->real_num_tx_queues > map_opt.ext.skbqueue) + skb_set_queue_mapping(skb, map_opt.ext.skbqueue); + } + return XT_CONTINUE; +} + + +static int +set_target_v3_checkentry(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + ip_set_id_t index; + + if (info->add_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, + info->add_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find add_set index %u as target\n", + info->add_set.index); + return -ENOENT; + } + } + + if (info->del_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, + info->del_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find del_set index %u as target\n", + info->del_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->add_set.index); + return -ENOENT; + } + } + + if (info->map_set.index != IPSET_INVALID_ID) { + if (strncmp(par->table, "mangle", 7)) { + pr_warn("--map-set only usable from mangle table\n"); + return -EINVAL; + } + if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | + (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && + !(par->hook_mask & (1 << NF_INET_FORWARD | + 1 << NF_INET_LOCAL_OUT | + 1 << NF_INET_POST_ROUTING))) { + pr_warn("mapping of prio or/and queue is allowed only" + "from OUTPUT/FORWARD/POSTROUTING chains\n"); + return -EINVAL; + } + index = ip_set_nfnl_get_byindex(par->net, + info->map_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find map_set index %u as target\n", + info->map_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->del_set.index); + return -ENOENT; + } + } + + if (info->add_set.dim > IPSET_DIM_MAX || + info->del_set.dim > IPSET_DIM_MAX || + info->map_set.dim > IPSET_DIM_MAX) { + pr_warn("Protocol error: SET target dimension " + "is over the limit!\n"); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_target_v3_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); +} + + static struct xt_match set_matches[] __read_mostly = { { .name = "set", @@ -493,6 +627,27 @@ static struct xt_target set_targets[] __read_mostly = { .destroy = set_target_v2_destroy, .me = THIS_MODULE }, + /* --map-set support */ + { + .name = "SET", + .revision = 3, + .family = NFPROTO_IPV4, + .target = set_target_v3, + .targetsize = sizeof(struct xt_set_info_target_v3), + .checkentry = set_target_v3_checkentry, + .destroy = set_target_v3_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 3, + .family = NFPROTO_IPV6, + .target = set_target_v3, + .targetsize = sizeof(struct xt_set_info_target_v3), + .checkentry = set_target_v3_checkentry, + .destroy = set_target_v3_destroy, + .me = THIS_MODULE + }, }; static int __init xt_set_init(void) -- cgit v1.2.3 From aef96193fe7b2791c4a3b19fe75426b929769471 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 15 Sep 2014 17:30:54 +0200 Subject: netfilter: ipset: send nonzero skbinfo extensions only Do not send zero valued skbinfo extensions to userspace at listing. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index b97aac5142ed..f1606fa6132d 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -335,13 +335,17 @@ ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, static inline bool ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo) { - return nla_put_net64(skb, IPSET_ATTR_SKBMARK, - cpu_to_be64((u64)skbinfo->skbmark << 32 | - skbinfo->skbmarkmask)) || - nla_put_net32(skb, IPSET_ATTR_SKBPRIO, - cpu_to_be32(skbinfo->skbprio)) || - nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, - cpu_to_be16(skbinfo->skbqueue)); + /* Send nonzero parameters only */ + return ((skbinfo->skbmark || skbinfo->skbmarkmask) && + nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask))) || + (skbinfo->skbprio && + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio))) || + (skbinfo->skbqueue && + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue))); } -- cgit v1.2.3 From 5075314e4e4b559cc37675ad8a721a89bccd6284 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 15 Sep 2014 13:00:19 -0400 Subject: dsa: Split ops up, and avoid assigning tag_protocol and receive separately This change addresses several issues. First, it was possible to set tag_protocol without setting the ops pointer. To correct that I have reordered things so that rcv is now populated before we set tag_protocol. Second, it didn't make much sense to keep setting the device ops each time a new slave was registered. So by moving the receive portion out into root switch initialization that issue should be addressed. Third, I wanted to avoid sending tags if the rcv pointer was not registered so I changed the tag check to verify if the rcv function pointer is set on the root tree. If it is then we start sending DSA tagged frames. Finally I split the device ops pointer in the structures into two spots. I placed the rcv function pointer in the root switch since this makes it easiest to access from there, and I placed the xmit function pointer in the slave for the same reason. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ------- include/net/dsa.h | 10 ++++++---- net/dsa/dsa.c | 32 ++++++++++++++++++++++++++++---- net/dsa/dsa_priv.h | 11 ++++++++++- net/dsa/slave.c | 37 +++++++++++++++---------------------- net/dsa/tag_brcm.c | 1 - net/dsa/tag_dsa.c | 1 - net/dsa/tag_edsa.c | 1 - net/dsa/tag_trailer.c | 1 - 9 files changed, 59 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f9e81d10a3b9..28d4378615e5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1928,13 +1928,6 @@ struct udp_offload { struct offload_callbacks callbacks; }; -struct dsa_device_ops { - netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); - int (*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); -}; - - /* often modified stats are per cpu, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64 rx_packets; diff --git a/include/net/dsa.h b/include/net/dsa.h index 8a8a5d976f97..a55c4e6a4f0f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -77,7 +77,7 @@ struct dsa_platform_data { struct dsa_chip_data *chip; }; -struct dsa_device_ops; +struct packet_type; struct dsa_switch_tree { /* @@ -91,7 +91,10 @@ struct dsa_switch_tree { * protocol to use. */ struct net_device *master_netdev; - const struct dsa_device_ops *ops; + int (*rcv)(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev); enum dsa_tag_protocol tag_protocol; /* @@ -218,7 +221,6 @@ static inline void *ds_to_priv(struct dsa_switch *ds) static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) { - return dst->tag_protocol != DSA_TAG_PROTO_NONE; + return dst->rcv != NULL; } - #endif diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 61f145c44555..1df0a7cf1e9e 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include @@ -154,9 +153,34 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, * tagging protocol to the preferred tagging format of this * switch. */ - if (ds->dst->cpu_switch == index) - ds->dst->tag_protocol = drv->tag_protocol; + if (dst->cpu_switch == index) { + switch (drv->tag_protocol) { +#ifdef CONFIG_NET_DSA_TAG_DSA + case DSA_TAG_PROTO_DSA: + dst->rcv = dsa_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + case DSA_TAG_PROTO_EDSA: + dst->rcv = edsa_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER + case DSA_TAG_PROTO_TRAILER: + dst->rcv = trailer_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM + case DSA_TAG_PROTO_BRCM: + dst->rcv = brcm_netdev_ops.rcv; + break; +#endif + default: + break; + } + dst->tag_protocol = drv->tag_protocol; + } /* * Do basic register setup. @@ -626,7 +650,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, return 0; } - return dst->ops->rcv(skb, dev, pt, orig_dev); + return dst->rcv(skb, dev, pt, orig_dev); } static struct packet_type dsa_pack_type __read_mostly = { diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 98afed4d92ba..f90899e8ab5a 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -12,7 +12,13 @@ #define __DSA_PRIV_H #include -#include +#include + +struct dsa_device_ops { + netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); + int (*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); +}; struct dsa_slave_priv { /* @@ -20,6 +26,8 @@ struct dsa_slave_priv { * switch port. */ struct net_device *dev; + netdev_tx_t (*xmit)(struct sk_buff *skb, + struct net_device *dev); /* * Which switch this port is a part of, and the port index @@ -43,6 +51,7 @@ struct dsa_slave_priv { extern char dsa_driver_version[]; /* slave.c */ +extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); struct net_device *dsa_slave_create(struct dsa_switch *ds, struct device *parent, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 809eeb13eb12..e38a331111c0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -9,7 +9,6 @@ */ #include -#include #include #include #include @@ -176,9 +175,8 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch_tree *dst = p->parent->dst; - return dst->ops->xmit(skb, dev); + return p->xmit(skb, dev); } static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb, @@ -325,11 +323,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_do_ioctl = dsa_slave_ioctl, }; -static const struct dsa_device_ops notag_netdev_ops = { - .xmit = dsa_slave_notag_xmit, - .rcv = NULL, -}; - static void dsa_slave_adjust_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -435,41 +428,41 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->tx_queue_len = 0; slave_dev->netdev_ops = &dsa_slave_netdev_ops; + SET_NETDEV_DEV(slave_dev, parent); + slave_dev->dev.of_node = ds->pd->port_dn[port]; + slave_dev->vlan_features = master->vlan_features; + + p = netdev_priv(slave_dev); + p->dev = slave_dev; + p->parent = ds; + p->port = port; + switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA case DSA_TAG_PROTO_DSA: - ds->dst->ops = &dsa_netdev_ops; + p->xmit = dsa_netdev_ops.xmit; break; #endif #ifdef CONFIG_NET_DSA_TAG_EDSA case DSA_TAG_PROTO_EDSA: - ds->dst->ops = &edsa_netdev_ops; + p->xmit = edsa_netdev_ops.xmit; break; #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER case DSA_TAG_PROTO_TRAILER: - ds->dst->ops = &trailer_netdev_ops; + p->xmit = trailer_netdev_ops.xmit; break; #endif #ifdef CONFIG_NET_DSA_TAG_BRCM case DSA_TAG_PROTO_BRCM: - ds->dst->ops = &brcm_netdev_ops; + p->xmit = brcm_netdev_ops.xmit; break; #endif default: - ds->dst->ops = ¬ag_netdev_ops; + p->xmit = dsa_slave_notag_xmit; break; } - SET_NETDEV_DEV(slave_dev, parent); - slave_dev->dev.of_node = ds->pd->port_dn[port]; - slave_dev->vlan_features = master->vlan_features; - - p = netdev_priv(slave_dev); - p->dev = slave_dev; - p->parent = ds; - p->port = port; - p->old_pause = -1; p->old_link = -1; p->old_duplex = -1; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 8fbc21c0de78..83d3572cdb20 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -11,7 +11,6 @@ #include #include -#include #include #include "dsa_priv.h" diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index d7dbc5bda5c0..ce90c8bdc658 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -10,7 +10,6 @@ #include #include -#include #include #include "dsa_priv.h" diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 6b30abe89183..94fcce778679 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -10,7 +10,6 @@ #include #include -#include #include #include "dsa_priv.h" diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 5fe9444842c5..115fdca34077 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -10,7 +10,6 @@ #include #include -#include #include #include "dsa_priv.h" -- cgit v1.2.3 From b4d2394d01bc642e95b2cba956d908423c1bef77 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 15 Sep 2014 13:00:27 -0400 Subject: dsa: Replace mii_bus with a generic host device This change makes it so that instead of passing and storing a mii_bus we instead pass and store a host_dev. From there we can test to determine the exact type of device, and can verify it is the correct device for our switch. So for example it would be possible to pass a device pointer from a pci_dev and instead of checking for a PHY ID we could check for a vendor and/or device ID. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- arch/arm/plat-orion/common.c | 2 +- drivers/net/dsa/bcm_sf2.c | 2 +- drivers/net/dsa/mv88e6060.c | 13 +++++++++---- drivers/net/dsa/mv88e6123_61_65.c | 6 +++++- drivers/net/dsa/mv88e6131.c | 6 +++++- drivers/net/dsa/mv88e6171.c | 6 +++++- drivers/net/dsa/mv88e6xxx.c | 4 ++-- include/net/dsa.h | 9 +++++---- net/dsa/dsa.c | 24 ++++++++---------------- net/dsa/slave.c | 2 +- 10 files changed, 42 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/arch/arm/plat-orion/common.c b/arch/arm/plat-orion/common.c index 3ec6e8e8d368..f5b00f41c4f6 100644 --- a/arch/arm/plat-orion/common.c +++ b/arch/arm/plat-orion/common.c @@ -499,7 +499,7 @@ void __init orion_ge00_switch_init(struct dsa_platform_data *d, int irq) d->netdev = &orion_ge00.dev; for (i = 0; i < d->nr_chips; i++) - d->chip[i].mii_bus = &orion_ge00_shared.dev; + d->chip[i].host_dev = &orion_ge00_shared.dev; orion_switch_device.dev.platform_data = d; platform_device_register(&orion_switch_device); diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index e9918c7f1792..02d7db320d90 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -129,7 +129,7 @@ static int bcm_sf2_sw_get_sset_count(struct dsa_switch *ds) return BCM_SF2_STATS_SIZE; } -static char *bcm_sf2_sw_probe(struct mii_bus *bus, int sw_addr) +static char *bcm_sf2_sw_probe(struct device *host_dev, int sw_addr) { return "Broadcom Starfighter 2"; } diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index d8037c1055ac..776e965dc9f4 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -21,7 +21,8 @@ static int reg_read(struct dsa_switch *ds, int addr, int reg) { - return mdiobus_read(ds->master_mii_bus, ds->pd->sw_addr + addr, reg); + return mdiobus_read(to_mii_bus(ds->master_dev), + ds->pd->sw_addr + addr, reg); } #define REG_READ(addr, reg) \ @@ -37,8 +38,8 @@ static int reg_read(struct dsa_switch *ds, int addr, int reg) static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val) { - return mdiobus_write(ds->master_mii_bus, ds->pd->sw_addr + addr, - reg, val); + return mdiobus_write(to_mii_bus(ds->master_dev), + ds->pd->sw_addr + addr, reg, val); } #define REG_WRITE(addr, reg, val) \ @@ -50,10 +51,14 @@ static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val) return __ret; \ }) -static char *mv88e6060_probe(struct mii_bus *bus, int sw_addr) +static char *mv88e6060_probe(struct device *host_dev, int sw_addr) { + struct mii_bus *bus = dsa_host_dev_to_mii_bus(host_dev); int ret; + if (bus == NULL) + return NULL; + ret = mdiobus_read(bus, sw_addr + REG_PORT(0), 0x03); if (ret >= 0) { ret &= 0xfff0; diff --git a/drivers/net/dsa/mv88e6123_61_65.c b/drivers/net/dsa/mv88e6123_61_65.c index 975774f338c2..a332c53ff955 100644 --- a/drivers/net/dsa/mv88e6123_61_65.c +++ b/drivers/net/dsa/mv88e6123_61_65.c @@ -17,10 +17,14 @@ #include #include "mv88e6xxx.h" -static char *mv88e6123_61_65_probe(struct mii_bus *bus, int sw_addr) +static char *mv88e6123_61_65_probe(struct device *host_dev, int sw_addr) { + struct mii_bus *bus = dsa_host_dev_to_mii_bus(host_dev); int ret; + if (bus == NULL) + return NULL; + ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); if (ret >= 0) { if (ret == 0x1212) diff --git a/drivers/net/dsa/mv88e6131.c b/drivers/net/dsa/mv88e6131.c index 35541f2ceca3..244c735014fa 100644 --- a/drivers/net/dsa/mv88e6131.c +++ b/drivers/net/dsa/mv88e6131.c @@ -22,10 +22,14 @@ #define ID_6095 0x0950 #define ID_6131 0x1060 -static char *mv88e6131_probe(struct mii_bus *bus, int sw_addr) +static char *mv88e6131_probe(struct device *host_dev, int sw_addr) { + struct mii_bus *bus = dsa_host_dev_to_mii_bus(host_dev); int ret; + if (bus == NULL) + return NULL; + ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); if (ret >= 0) { ret &= 0xfff0; diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c index 03a70069a8c6..6365e30138af 100644 --- a/drivers/net/dsa/mv88e6171.c +++ b/drivers/net/dsa/mv88e6171.c @@ -17,10 +17,14 @@ #include #include "mv88e6xxx.h" -static char *mv88e6171_probe(struct mii_bus *bus, int sw_addr) +static char *mv88e6171_probe(struct device *host_dev, int sw_addr) { + struct mii_bus *bus = dsa_host_dev_to_mii_bus(host_dev); int ret; + if (bus == NULL) + return NULL; + ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); if (ret >= 0) { if ((ret & 0xfff0) == 0x1710) diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index 901d2a9704ef..d6f6428b27dc 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -78,7 +78,7 @@ int mv88e6xxx_reg_read(struct dsa_switch *ds, int addr, int reg) int ret; mutex_lock(&ps->smi_mutex); - ret = __mv88e6xxx_reg_read(ds->master_mii_bus, + ret = __mv88e6xxx_reg_read(to_mii_bus(ds->master_dev), ds->pd->sw_addr, addr, reg); mutex_unlock(&ps->smi_mutex); @@ -122,7 +122,7 @@ int mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg, u16 val) int ret; mutex_lock(&ps->smi_mutex); - ret = __mv88e6xxx_reg_write(ds->master_mii_bus, + ret = __mv88e6xxx_reg_write(to_mii_bus(ds->master_dev), ds->pd->sw_addr, addr, reg, val); mutex_unlock(&ps->smi_mutex); diff --git a/include/net/dsa.h b/include/net/dsa.h index a55c4e6a4f0f..c779e9bba1b3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -34,7 +34,7 @@ struct dsa_chip_data { /* * How to access the switch configuration registers. */ - struct device *mii_bus; + struct device *host_dev; int sw_addr; /* Device tree node pointer for this specific switch chip @@ -134,9 +134,9 @@ struct dsa_switch { struct dsa_switch_driver *drv; /* - * Reference to mii bus to use. + * Reference to host device to use. */ - struct mii_bus *master_mii_bus; + struct device *master_dev; /* * Slave mii_bus and devices for the individual ports. @@ -178,7 +178,7 @@ struct dsa_switch_driver { /* * Probing and setup. */ - char *(*probe)(struct mii_bus *bus, int sw_addr); + char *(*probe)(struct device *host_dev, int sw_addr); int (*setup)(struct dsa_switch *ds); int (*set_addr)(struct dsa_switch *ds, u8 *addr); @@ -213,6 +213,7 @@ struct dsa_switch_driver { void register_switch_driver(struct dsa_switch_driver *type); void unregister_switch_driver(struct dsa_switch_driver *type); +struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev); static inline void *ds_to_priv(struct dsa_switch *ds) { diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 1df0a7cf1e9e..b34d6978d773 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -43,7 +43,7 @@ void unregister_switch_driver(struct dsa_switch_driver *drv) EXPORT_SYMBOL_GPL(unregister_switch_driver); static struct dsa_switch_driver * -dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) +dsa_switch_probe(struct device *host_dev, int sw_addr, char **_name) { struct dsa_switch_driver *ret; struct list_head *list; @@ -58,7 +58,7 @@ dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) drv = list_entry(list, struct dsa_switch_driver, list); - name = drv->probe(bus, sw_addr); + name = drv->probe(host_dev, sw_addr); if (name != NULL) { ret = drv; break; @@ -75,7 +75,7 @@ dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) /* basic switch operations **************************************************/ static struct dsa_switch * dsa_switch_setup(struct dsa_switch_tree *dst, int index, - struct device *parent, struct mii_bus *bus) + struct device *parent, struct device *host_dev) { struct dsa_chip_data *pd = dst->pd->chip + index; struct dsa_switch_driver *drv; @@ -88,7 +88,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, /* * Probe for switch model. */ - drv = dsa_switch_probe(bus, pd->sw_addr, &name); + drv = dsa_switch_probe(host_dev, pd->sw_addr, &name); if (drv == NULL) { printk(KERN_ERR "%s[%d]: could not detect attached switch\n", dst->master_netdev->name, index); @@ -109,8 +109,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, ds->index = index; ds->pd = dst->pd->chip + index; ds->drv = drv; - ds->master_mii_bus = bus; - + ds->master_dev = host_dev; /* * Validate supplied switch configuration. @@ -285,7 +284,7 @@ static struct device *dev_find_class(struct device *parent, char *class) return device_find_child(parent, class, dev_is_class); } -static struct mii_bus *dev_to_mii_bus(struct device *dev) +struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev) { struct device *d; @@ -301,6 +300,7 @@ static struct mii_bus *dev_to_mii_bus(struct device *dev) return NULL; } +EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus); static struct net_device *dev_to_net_device(struct device *dev) { @@ -566,17 +566,9 @@ static int dsa_probe(struct platform_device *pdev) dst->cpu_port = -1; for (i = 0; i < pd->nr_chips; i++) { - struct mii_bus *bus; struct dsa_switch *ds; - bus = dev_to_mii_bus(pd->chip[i].mii_bus); - if (bus == NULL) { - printk(KERN_ERR "%s[%d]: no mii bus found for " - "dsa switch\n", dev->name, i); - continue; - } - - ds = dsa_switch_setup(dst, i, &pdev->dev, bus); + ds = dsa_switch_setup(dst, i, &pdev->dev, pd->chip[i].host_dev); if (IS_ERR(ds)) { printk(KERN_ERR "%s[%d]: couldn't create dsa switch " "instance (error %ld)\n", dev->name, i, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e38a331111c0..90c9689ed362 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -44,7 +44,7 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->write = dsa_slave_phy_write; snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x", ds->index, ds->pd->sw_addr); - ds->slave_mii_bus->parent = &ds->master_mii_bus->dev; + ds->slave_mii_bus->parent = ds->master_dev; } -- cgit v1.2.3 From 6cff339bbd5f9eda7a5e8a521f91a88d046e6d0c Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:20 -0700 Subject: ipvs: Add destination address family to netlink interface This is necessary to support heterogeneous pools. For example, if you have an ipv6 addressed network, you'll want to be able to forward ipv4 traffic into it. This patch enforces that destination address family is the same as service family, as none of the forwarding mechanisms support anything else. For the old setsockopt mechanism, we simply set the dest address family to AF_INET as we do with the service. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 +++ include/uapi/linux/ip_vs.h | 3 +++ net/netfilter/ipvs/ip_vs_ctl.c | 45 +++++++++++++++++++++++++++++++++++------- 3 files changed, 44 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 624a8a54806d..b7e2b624d58e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -648,6 +648,9 @@ struct ip_vs_dest_user_kern { /* thresholds for active connections */ u32 u_threshold; /* upper threshold */ u32 l_threshold; /* lower threshold */ + + /* Address family of addr */ + u16 af; }; diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index fbcffe8041f7..cabe95d5b461 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -384,6 +384,9 @@ enum { IPVS_DEST_ATTR_PERSIST_CONNS, /* persistent connections */ IPVS_DEST_ATTR_STATS, /* nested attribute for dest stats */ + + IPVS_DEST_ATTR_ADDR_FAMILY, /* Address family of address */ + __IPVS_DEST_ATTR_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index bd2b208ba56c..594cec7ebc43 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -816,6 +816,8 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; + dest->af = udest->af; + spin_lock_bh(&dest->dst_lock); __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); @@ -846,8 +848,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, EnterFunction(2); + /* Temporary for consistency */ + if (udest->af != svc->af) + return -EINVAL; + #ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { + if (udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && @@ -875,12 +881,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, u64_stats_init(&ip_vs_dest_stats->syncp); } - dest->af = svc->af; + dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; dest->vport = svc->port; dest->vfwmark = svc->fwmark; - ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); + ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); dest->port = udest->port; atomic_set(&dest->activeconns, 0); @@ -928,7 +934,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); @@ -949,7 +955,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) if (dest != NULL) { IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " "dest->refcnt=%d, service %u/%s:%u\n", - IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), atomic_read(&dest->refcnt), dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->vaddr), @@ -992,7 +998,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); @@ -2244,6 +2250,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, udest->weight = udest_compat->weight; udest->u_threshold = udest_compat->u_threshold; udest->l_threshold = udest_compat->l_threshold; + udest->af = AF_INET; } static int @@ -2480,6 +2487,12 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, if (count >= get->num_dests) break; + /* Cannot expose heterogeneous members via sockopt + * interface + */ + if (dest->af != svc->af) + continue; + entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); @@ -2777,6 +2790,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, + [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3032,7 +3046,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, atomic_read(&dest->inactconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns))) + atomic_read(&dest->persistconns)) || + nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) goto nla_put_failure; @@ -3113,6 +3128,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, { struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; struct nlattr *nla_addr, *nla_port; + struct nlattr *nla_addr_family; /* Parse mandatory identifying destination fields first */ if (nla == NULL || @@ -3121,6 +3137,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; nla_port = attrs[IPVS_DEST_ATTR_PORT]; + nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; if (!(nla_addr && nla_port)) return -EINVAL; @@ -3130,6 +3147,11 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); + if (nla_addr_family) + udest->af = nla_get_u16(nla_addr_family); + else + udest->af = 0; + /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, @@ -3357,6 +3379,15 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) need_full_dest); if (ret) goto out; + + /* Old protocols did not allow the user to specify address + * family, so we set it to zero instead. We also didn't + * allow heterogeneous pools in the old code, so it's safe + * to assume that this will have the same address family as + * the service. + */ + if (udest.af == 0) + udest.af = svc->af; } switch (cmd) { -- cgit v1.2.3 From 655eef103d0bd99f540a52f7ede032e120756846 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:21 -0700 Subject: ipvs: Supply destination addr family to ip_vs_{lookup_dest,find_dest} We need to remove the assumption that virtual address family is the same as real address family in order to support heterogeneous services (that is, services with v4 vips and v6 backends or the opposite). Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 5 +++-- net/netfilter/ipvs/ip_vs_conn.c | 8 +++++++- net/netfilter/ipvs/ip_vs_ctl.c | 24 ++++++++++++------------ net/netfilter/ipvs/ip_vs_sync.c | 10 ++++++++-- 4 files changed, 30 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index b7e2b624d58e..2fa1155b24f7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1399,8 +1399,9 @@ void ip_vs_unregister_nl_ioctl(void); int ip_vs_control_init(void); void ip_vs_control_cleanup(void); struct ip_vs_dest * -ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, - __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, +ip_vs_find_dest(struct net *net, int svc_af, int dest_af, + const union nf_inet_addr *daddr, __be16 dport, + const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, __u32 fwmark, __u32 flags); void ip_vs_try_bind_dest(struct ip_vs_conn *cp); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 610e19c0e13f..8f4c602bb34b 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -616,7 +616,13 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp) struct ip_vs_dest *dest; rcu_read_lock(); - dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + + /* This function is only invoked by the synchronization code. We do + * not currently support heterogeneous pools with synchronization, + * so we can make the assumption that the svc_af is the same as the + * dest_af + */ + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, cp->protocol, cp->fwmark, cp->flags); if (dest) { diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 594cec7ebc43..c840d895bd1a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -574,8 +574,8 @@ bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, * Called under RCU lock. */ static struct ip_vs_dest * -ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) +ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, + const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; @@ -583,9 +583,9 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * Find the destination for the given service */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { - if ((dest->af == svc->af) - && ip_vs_addr_equal(svc->af, &dest->addr, daddr) - && (dest->port == dport)) { + if ((dest->af == dest_af) && + ip_vs_addr_equal(dest_af, &dest->addr, daddr) && + (dest->port == dport)) { /* HIT */ return dest; } @@ -602,7 +602,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * on the backup. * Called under RCU lock, no refcnt is returned. */ -struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, +struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, @@ -613,14 +613,14 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, struct ip_vs_service *svc; __be16 port = dport; - svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) port = 0; - dest = ip_vs_lookup_dest(svc, daddr, port); + dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); if (!dest) - dest = ip_vs_lookup_dest(svc, daddr, port ^ dport); + dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); return dest; } @@ -938,7 +938,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &daddr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest != NULL) { @@ -1002,7 +1002,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &daddr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest == NULL) { @@ -1084,7 +1084,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); rcu_read_unlock(); if (dest == NULL) { diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index eadffb29dec0..edd266414b7d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -880,8 +880,14 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, * but still handled. */ rcu_read_lock(); - dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, - param->vport, protocol, fwmark, flags); + /* This function is only invoked by the synchronization + * code. We do not currently support heterogeneous pools + * with synchronization, so we can make the assumption that + * the svc_af is the same as the dest_af + */ + dest = ip_vs_find_dest(net, type, type, daddr, dport, + param->vaddr, param->vport, protocol, + fwmark, flags); cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); rcu_read_unlock(); -- cgit v1.2.3 From ba38528aae6ee2d22226c6a78727ddc13512b068 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:23 -0700 Subject: ipvs: Supply destination address family to ip_vs_conn_new The assumption that dest af is equal to service af is now unreliable, so we must specify it manually so as not to copy just the first 4 bytes of a v6 address or doing an illegal read of 16 butes on a v6 address. We "lie" in two places: for synchronization (which we will explicitly disallow from happening when we have heterogeneous pools) and for black hole addresses where there's no real dest. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 ++- net/netfilter/ipvs/ip_vs_conn.c | 5 +++-- net/netfilter/ipvs/ip_vs_core.c | 9 +++++---- net/netfilter/ipvs/ip_vs_ftp.c | 6 ++++-- net/netfilter/ipvs/ip_vs_sync.c | 3 ++- 5 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 2fa1155b24f7..7600dbe5780e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -535,6 +535,7 @@ struct ip_vs_conn { union nf_inet_addr daddr; /* destination address */ volatile __u32 flags; /* status flags */ __u16 protocol; /* Which protocol (TCP/UDP) */ + __u16 daf; /* Address family of the dest */ #ifdef CONFIG_NET_NS struct net *net; /* Name space */ #endif @@ -1213,7 +1214,7 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); -struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, +struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 8f4c602bb34b..fdb4880a3a79 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -854,7 +854,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) * Create a new connection entry and hash it into the ip_vs_conn_tab */ struct ip_vs_conn * -ip_vs_conn_new(const struct ip_vs_conn_param *p, +ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark) { @@ -873,6 +873,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); ip_vs_conn_net_set(cp, p->net); cp->af = p->af; + cp->daf = dest_af; cp->protocol = p->protocol; ip_vs_addr_set(p->af, &cp->caddr, p->caddr); cp->cport = p->cport; @@ -880,7 +881,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, &cp->vaddr, p->vaddr); cp->vport = p->vport; - ip_vs_addr_set(p->af, &cp->daddr, daddr); + ip_vs_addr_set(cp->daf, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; cp->fwmark = fwmark; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5c34e8d42e01..1f6ecb74821f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -328,7 +328,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * This adds param.pe_data to the template, * and thus param.pe_data will be destroyed * when the template expires */ - ct = ip_vs_conn_new(¶m, &dest->addr, dport, + ct = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, IP_VS_CONN_F_TEMPLATE, dest, skb->mark); if (ct == NULL) { kfree(param.pe_data); @@ -357,7 +357,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, src_port, &iph->daddr, dst_port, ¶m); - cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); + cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, + skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); *ignored = -1; @@ -479,7 +480,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); - cp = ip_vs_conn_new(&p, &dest->addr, + cp = ip_vs_conn_new(&p, dest->af, &dest->addr, dest->port ? dest->port : pptr[1], flags, dest, skb->mark); if (!cp) { @@ -550,7 +551,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); - cp = ip_vs_conn_new(&p, &daddr, 0, + cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, NULL, skb->mark); if (!cp) diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 77c173282f38..a64fa15790e5 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -233,7 +233,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, IPPROTO_TCP, &cp->caddr, 0, &cp->vaddr, port, &p); - n_cp = ip_vs_conn_new(&p, &from, port, + /* As above, this is ipv4 only */ + n_cp = ip_vs_conn_new(&p, AF_INET, &from, port, IP_VS_CONN_F_NO_CPORT | IP_VS_CONN_F_NFCT, cp->dest, skb->mark); @@ -396,7 +397,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { - n_cp = ip_vs_conn_new(&p, &cp->daddr, + /* This is ipv4 only */ + n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr, htons(ntohs(cp->dport)-1), IP_VS_CONN_F_NFCT, cp->dest, skb->mark); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index edd266414b7d..7162c86fd50d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -889,7 +889,8 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, param->vaddr, param->vport, protocol, fwmark, flags); - cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); + cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, + fwmark); rcu_read_unlock(); if (!cp) { kfree(param->pe_data); -- cgit v1.2.3 From 391f503d69779867f05e9296ae523e9002c2d7ee Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:24 -0700 Subject: ipvs: prevent mixing heterogeneous pools and synchronization The synchronization protocol is not compatible with heterogeneous pools, so we need to verify that we're not turning both on at the same time. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++++ net/netfilter/ipvs/ip_vs_ctl.c | 15 +++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 7600dbe5780e..576d7f0bed5d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -990,6 +990,10 @@ struct netns_ipvs { char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; /* net name space ptr */ struct net *net; /* Needed by timer routines */ + /* Number of heterogeneous destinations, needed because + * heterogeneous are not supported when synchronization is + * enabled */ + unsigned int mixed_address_family_dests; }; #define DEFAULT_SYNC_THRESHOLD 3 diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6bd2cc682137..462760eded94 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -779,6 +779,12 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_scheduler *sched; int conn_flags; + /* We cannot modify an address and change the address family */ + BUG_ON(!add && udest->af != dest->af); + + if (add && udest->af != svc->af) + ipvs->mixed_address_family_dests++; + /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; @@ -1061,6 +1067,9 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, list_del_rcu(&dest->n_list); svc->num_dests--; + if (dest->af != svc->af) + net_ipvs(svc->net)->mixed_address_family_dests--; + if (svcupd) { struct ip_vs_scheduler *sched; @@ -3256,6 +3265,12 @@ static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; + /* The synchronization protocol is incompatible with mixed family + * services + */ + if (net_ipvs(net)->mixed_address_family_dests > 0) + return -EINVAL; + return start_sync_thread(net, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), -- cgit v1.2.3 From 971427f353f3c42c8dcef62e7124440df68eb809 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Mon, 15 Sep 2014 19:37:25 -0700 Subject: openvswitch: Add recirc and hash action. Recirc action allows a packet to reenter openvswitch processing. currently openvswitch lookup flow for packet received and execute set of actions on that packet, with help of recirc action we can process/modify the packet and recirculate it back in openvswitch for another pass. OVS hash action calculates 5-tupple hash and set hash in flow-key hash. This can be used along with recirculation for distributing packets among different ports for bond devices. For example: OVS bonding can use following actions: Match on: bond flow; Action: hash, recirc(id) Match on: recirc-id == id and hash lower bits == a; Action: output port_bond_a Signed-off-by: Andy Zhou Acked-by: Jesse Gross Signed-off-by: Pravin B Shelar --- include/uapi/linux/openvswitch.h | 26 +++++ net/openvswitch/actions.c | 203 +++++++++++++++++++++++++++++++++++++-- net/openvswitch/datapath.c | 11 ++- net/openvswitch/datapath.h | 7 +- net/openvswitch/flow.c | 7 +- net/openvswitch/flow.h | 3 + net/openvswitch/flow_netlink.c | 43 ++++++++- 7 files changed, 288 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index a794d1dd7b40..f7fc507d82ab 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -289,6 +289,9 @@ enum ovs_key_attr { OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ OVS_KEY_ATTR_SCTP, /* struct ovs_key_sctp */ OVS_KEY_ATTR_TCP_FLAGS, /* be16 TCP flags. */ + OVS_KEY_ATTR_DP_HASH, /* u32 hash value. Value 0 indicates the hash + is not computed by the datapath. */ + OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ #ifdef __KERNEL__ OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ @@ -493,6 +496,27 @@ struct ovs_action_push_vlan { __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ }; +/* Data path hash algorithm for computing Datapath hash. + * + * The algorithm type only specifies the fields in a flow + * will be used as part of the hash. Each datapath is free + * to use its own hash algorithm. The hash value will be + * opaque to the user space daemon. + */ +enum ovs_hash_alg { + OVS_HASH_ALG_L4, +}; + +/* + * struct ovs_action_hash - %OVS_ACTION_ATTR_HASH action argument. + * @hash_alg: Algorithm used to compute hash prior to recirculation. + * @hash_basis: basis used for computing hash. + */ +struct ovs_action_hash { + uint32_t hash_alg; /* One of ovs_hash_alg. */ + uint32_t hash_basis; +}; + /** * enum ovs_action_attr - Action types. * @@ -521,6 +545,8 @@ enum ovs_action_attr { OVS_ACTION_ATTR_PUSH_VLAN, /* struct ovs_action_push_vlan. */ OVS_ACTION_ATTR_POP_VLAN, /* No argument. */ OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ + OVS_ACTION_ATTR_RECIRC, /* u32 recirc_id. */ + OVS_ACTION_ATTR_HASH, /* struct ovs_action_hash. */ __OVS_ACTION_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c31bb80c984f..6932a42e41a2 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -35,12 +35,78 @@ #include #include "datapath.h" +#include "flow.h" #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len); +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define DEFERRED_ACTION_FIFO_SIZE 10 +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. */ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +}; + +static struct action_fifo __percpu *action_fifos; +static DEFINE_PER_CPU(int, exec_actions_level); + +static void action_fifo_init(struct action_fifo *fifo) +{ + fifo->head = 0; + fifo->tail = 0; +} + +static bool action_fifo_is_empty(struct action_fifo *fifo) +{ + return (fifo->head == fifo->tail); +} + +static struct deferred_action *action_fifo_get(struct action_fifo *fifo) +{ + if (action_fifo_is_empty(fifo)) + return NULL; + + return &fifo->fifo[fifo->tail++]; +} + +static struct deferred_action *action_fifo_put(struct action_fifo *fifo) +{ + if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1) + return NULL; + + return &fifo->fifo[fifo->head++]; +} + +/* Return true if fifo is not full */ +static struct deferred_action *add_deferred_actions(struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct action_fifo *fifo; + struct deferred_action *da; + + fifo = this_cpu_ptr(action_fifos); + da = action_fifo_put(fifo); + if (da) { + da->skb = skb; + da->actions = attr; + da->pkt_key = *key; + } + + return da; +} + static int make_writable(struct sk_buff *skb, int write_len) { if (!pskb_may_pull(skb, write_len)) @@ -485,8 +551,29 @@ static int sample(struct datapath *dp, struct sk_buff *skb, /* Skip the sample action when out of memory. */ return 0; - /* do_execute_actions() will consume the cloned skb. */ - return do_execute_actions(dp, skb, key, a, rem); + if (!add_deferred_actions(skb, key, a)) { + if (net_ratelimit()) + pr_warn("%s: deferred actions limit reached, dropping sample action\n", + ovs_dp_name(dp)); + + kfree_skb(skb); + } + return 0; +} + +static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct ovs_action_hash *hash_act = nla_data(attr); + u32 hash = 0; + + /* OVS_HASH_ALG_L4 is the only possible hash algorithm. */ + hash = skb_get_hash(skb); + hash = jhash_1word(hash, hash_act->hash_basis); + if (!hash) + hash = 0x1; + + key->ovs_flow_hash = hash; } static int execute_set_action(struct sk_buff *skb, @@ -535,6 +622,44 @@ static int execute_set_action(struct sk_buff *skb, return err; } +static int execute_recirc(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *a, int rem) +{ + struct deferred_action *da; + int err; + + err = ovs_flow_key_update(skb, key); + if (err) + return err; + + if (!last_action(a, rem)) { + /* Recirc action is the not the last action + * of the action list, need to clone the skb. + */ + skb = skb_clone(skb, GFP_ATOMIC); + + /* Skip the recirc action when out of memory, but + * continue on with the rest of the action list. + */ + if (!skb) + return 0; + } + + da = add_deferred_actions(skb, key, NULL); + if (da) { + da->pkt_key.recirc_id = nla_get_u32(a); + } else { + kfree_skb(skb); + + if (net_ratelimit()) + pr_warn("%s: deferred action limit reached, drop recirc action\n", + ovs_dp_name(dp)); + } + + return 0; +} + /* Execute a list of actions against 'skb'. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, @@ -566,6 +691,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, output_userspace(dp, skb, key, a); break; + case OVS_ACTION_ATTR_HASH: + execute_hash(skb, key, a); + break; + case OVS_ACTION_ATTR_PUSH_VLAN: err = push_vlan(skb, nla_data(a)); if (unlikely(err)) /* skb already freed. */ @@ -576,6 +705,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, err = pop_vlan(skb); break; + case OVS_ACTION_ATTR_RECIRC: + err = execute_recirc(dp, skb, key, a, rem); + if (last_action(a, rem)) { + /* If this is the last action, the skb has + * been consumed or freed. + * Return immediately. + */ + return err; + } + break; + case OVS_ACTION_ATTR_SET: err = execute_set_action(skb, nla_data(a)); break; @@ -601,12 +741,63 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return 0; } +static void process_deferred_actions(struct datapath *dp) +{ + struct action_fifo *fifo = this_cpu_ptr(action_fifos); + + /* Do not touch the FIFO in case there is no deferred actions. */ + if (action_fifo_is_empty(fifo)) + return; + + /* Finishing executing all deferred actions. */ + do { + struct deferred_action *da = action_fifo_get(fifo); + struct sk_buff *skb = da->skb; + struct sw_flow_key *key = &da->pkt_key; + const struct nlattr *actions = da->actions; + + if (actions) + do_execute_actions(dp, skb, key, actions, + nla_len(actions)); + else + ovs_dp_process_packet(skb, key); + } while (!action_fifo_is_empty(fifo)); + + /* Reset FIFO for the next packet. */ + action_fifo_init(fifo); +} + /* Execute a list of actions against 'skb'. */ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key) { - struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + int level = this_cpu_read(exec_actions_level); + struct sw_flow_actions *acts; + int err; + + acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + + this_cpu_inc(exec_actions_level); + err = do_execute_actions(dp, skb, key, + acts->actions, acts->actions_len); + + if (!level) + process_deferred_actions(dp); + + this_cpu_dec(exec_actions_level); + return err; +} + +int action_fifos_init(void) +{ + action_fifos = alloc_percpu(struct action_fifo); + if (!action_fifos) + return -ENOMEM; - return do_execute_actions(dp, skb, key, - acts->actions, acts->actions_len); + return 0; +} + +void action_fifos_exit(void) +{ + free_percpu(action_fifos); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 7e0819919b8a..16cad14fa81e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -156,7 +156,7 @@ static struct datapath *get_dp(struct net *net, int dp_ifindex) } /* Must be called with rcu_read_lock or ovs_mutex. */ -static const char *ovs_dp_name(const struct datapath *dp) +const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); return vport->ops->get_name(vport); @@ -2065,10 +2065,14 @@ static int __init dp_init(void) pr_info("Open vSwitch switching datapath\n"); - err = ovs_internal_dev_rtnl_link_register(); + err = action_fifos_init(); if (err) goto error; + err = ovs_internal_dev_rtnl_link_register(); + if (err) + goto error_action_fifos_exit; + err = ovs_flow_init(); if (err) goto error_unreg_rtnl_link; @@ -2101,6 +2105,8 @@ error_flow_exit: ovs_flow_exit(); error_unreg_rtnl_link: ovs_internal_dev_rtnl_link_unregister(); +error_action_fifos_exit: + action_fifos_exit(); error: return err; } @@ -2114,6 +2120,7 @@ static void dp_cleanup(void) ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); + action_fifos_exit(); } module_init(dp_init); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 25b0e888cb27..ac3f3df96961 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -189,13 +189,18 @@ void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); +const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *); + void ovs_dp_notify_wq(struct work_struct *work); +int action_fifos_init(void); +void action_fifos_exit(void); + #define OVS_NLERR(fmt, ...) \ do { \ if (net_ratelimit()) \ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index bf8442071d75..4010423f2831 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -606,6 +606,11 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return 0; } +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) +{ + return key_extract(skb, key); +} + int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, struct sk_buff *skb, struct sw_flow_key *key) { diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 3869a540365c..0f5db4ec565d 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -72,6 +72,8 @@ struct sw_flow_key { u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ } __packed phy; /* Safe when right after 'tun_key'. */ + u32 ovs_flow_hash; /* Datapath computed hash value. */ + u32 recirc_id; /* Recirculation ID. */ struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ @@ -187,6 +189,7 @@ void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 630b320fbf3e..f4c8daa73965 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -251,6 +251,8 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), + [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32), + [OVS_KEY_ATTR_DP_HASH] = sizeof(u32), [OVS_KEY_ATTR_TUNNEL] = -1, }; @@ -454,6 +456,20 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask) { + if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { + u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); + + SW_FLOW_KEY_PUT(match, ovs_flow_hash, hash_val, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_DP_HASH); + } + + if (*attrs & (1 << OVS_KEY_ATTR_RECIRC_ID)) { + u32 recirc_id = nla_get_u32(a[OVS_KEY_ATTR_RECIRC_ID]); + + SW_FLOW_KEY_PUT(match, recirc_id, recirc_id, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_RECIRC_ID); + } + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { SW_FLOW_KEY_PUT(match, phy.priority, nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); @@ -873,6 +889,12 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, struct nlattr *nla, *encap; bool is_mask = (swkey != output); + if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash)) + goto nla_put_failure; + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; @@ -1401,11 +1423,13 @@ int ovs_nla_copy_actions(const struct nlattr *attr, /* Expected argument lengths, (u32)-1 for variable length. */ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, - [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -1432,6 +1456,18 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_HASH: { + const struct ovs_action_hash *act_hash = nla_data(a); + + switch (act_hash->hash_alg) { + case OVS_HASH_ALG_L4: + break; + default: + return -EINVAL; + } + + break; + } case OVS_ACTION_ATTR_POP_VLAN: break; @@ -1444,6 +1480,9 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_RECIRC: + break; + case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, &skip_copy); if (err) -- cgit v1.2.3 From 0097db06f5ab2df1756bc4cbf4395593024d87a1 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 16 Sep 2014 21:36:09 +0200 Subject: Bluetooth: Remove exported hci_recv_fragment function The hci_recv_fragment function is no longer used by any driver and thus do not export it. In fact it is not even needed by the core and it can be removed altogether. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 1 - net/bluetooth/hci_core.c | 20 -------------------- 2 files changed, 21 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 206b92bfeebb..37ff1aef0845 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -926,7 +926,6 @@ int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr); void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb); int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb); -int hci_recv_fragment(struct hci_dev *hdev, int type, void *data, int count); int hci_recv_stream_fragment(struct hci_dev *hdev, void *data, int count); void hci_init_sysfs(struct hci_dev *hdev); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 41948678f514..cb05d7f16a34 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4374,26 +4374,6 @@ static int hci_reassembly(struct hci_dev *hdev, int type, void *data, return remain; } -int hci_recv_fragment(struct hci_dev *hdev, int type, void *data, int count) -{ - int rem = 0; - - if (type < HCI_ACLDATA_PKT || type > HCI_EVENT_PKT) - return -EILSEQ; - - while (count) { - rem = hci_reassembly(hdev, type, data, count, type - 1); - if (rem < 0) - return rem; - - data += (count - rem); - count = rem; - } - - return rem; -} -EXPORT_SYMBOL(hci_recv_fragment); - #define STREAM_REASSEMBLY 0 int hci_recv_stream_fragment(struct hci_dev *hdev, void *data, int count) -- cgit v1.2.3 From 689f1c9de2abbd76fda224d12cea5f43568a4335 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 18 Sep 2014 16:38:18 +0800 Subject: ipsec: Remove obsolete MAX_AH_AUTH_LEN While tracking down the MAX_AH_AUTH_LEN crash in an old kernel I thought that this limit was rather arbitrary and we should just get rid of it. In fact it seems that we've already done all the work needed to remove it apart from actually removing it. This limit was there in order to limit stack usage. Since we've already switched over to allocating scratch space using kmalloc, there is no longer any need to limit the authentication length. This patch kills all references to it, including the BUG_ONs that led me here. Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- include/net/ah.h | 3 --- net/ipv4/ah4.c | 2 -- net/ipv6/ah6.c | 2 -- net/xfrm/xfrm_user.c | 3 +-- 4 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/ah.h b/include/net/ah.h index ca95b98969dd..4e2dfa474a7e 100644 --- a/include/net/ah.h +++ b/include/net/ah.h @@ -3,9 +3,6 @@ #include -/* This is the maximum truncated ICV length that we know of. */ -#define MAX_AH_AUTH_LEN 64 - struct crypto_ahash; struct ah_data { diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index a2afa89513a0..ac9a32ec3ee4 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -505,8 +505,6 @@ static int ah_init_state(struct xfrm_state *x) ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; - BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - if (x->props.flags & XFRM_STATE_ALIGN4) x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index fcffd4e522c8..6d16eb0e0c7f 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -713,8 +713,6 @@ static int ah6_init_state(struct xfrm_state *x) ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; - BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); switch (x->props.mode) { diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index eaf8a8f1cbe8..e812e988c111 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -333,8 +333,7 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, algo = xfrm_aalg_get_byname(ualg->alg_name, 1); if (!algo) return -ENOSYS; - if ((ualg->alg_trunc_len / 8) > MAX_AH_AUTH_LEN || - ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits) + if (ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits) return -EINVAL; *props = algo->desc.sadb_alg_id; -- cgit v1.2.3 From 84d7fce693884897c6196cc98228a2ad56ae2a9a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 4 Sep 2014 14:30:22 +0200 Subject: netfilter: nf_tables: export rule-set generation ID This patch exposes the ruleset generation ID in three ways: 1) The new command NFT_MSG_GETGEN that exposes the 32-bits ruleset generation ID. This ID is incremented in every commit and it should be large enough to avoid wraparound problems. 2) The less significant 16-bits of the generation ID are exposed through the nfgenmsg->res_id header field. This allows us to quickly catch if the ruleset has change between two consecutive list dumps from different object lists (in this specific case I think the risk of wraparound is unlikely). 3) Userspace subscribers may receive notifications of new rule-set generation after every commit. This also provides an alternative way to monitor the generation ID. If the events are lost, the userspace process hits a overrun error, so it knows that it is working with a stale ruleset anyway. Patrick spotted that rule-set transformations in userspace may take quite some time. In that case, it annotates the 32-bits generation ID before fetching the rule-set, then: 1) it compares it to what we obtain after the transformation to make sure it is not working with a stale rule-set and no wraparound has ocurred. 2) it subscribes to ruleset notifications, so it can watch for new generation ID. This is complementary to the NLM_F_DUMP_INTR approach, which allows us to detect an interference in the middle one single list dumping. There is no way to explicitly check that an interference has occurred between two list dumps from the kernel, since it doesn't know how many lists the userspace client is actually going to dump. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 16 ++++ net/netfilter/nf_tables_api.c | 140 +++++++++++++++++++++++++------ 2 files changed, 130 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 66d66dd3ff79..b72ccfeaf865 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -51,6 +51,8 @@ enum nft_verdicts { * @NFT_MSG_NEWSETELEM: create a new set element (enum nft_set_elem_attributes) * @NFT_MSG_GETSETELEM: get a set element (enum nft_set_elem_attributes) * @NFT_MSG_DELSETELEM: delete a set element (enum nft_set_elem_attributes) + * @NFT_MSG_NEWGEN: announce a new generation, only for events (enum nft_gen_attributes) + * @NFT_MSG_GETGEN: get the rule-set generation (enum nft_gen_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -68,6 +70,8 @@ enum nf_tables_msg_types { NFT_MSG_NEWSETELEM, NFT_MSG_GETSETELEM, NFT_MSG_DELSETELEM, + NFT_MSG_NEWGEN, + NFT_MSG_GETGEN, NFT_MSG_MAX, }; @@ -812,4 +816,16 @@ enum nft_masq_attributes { }; #define NFTA_MASQ_MAX (__NFTA_MASQ_MAX - 1) +/** + * enum nft_gen_attributes - nf_tables ruleset generation attributes + * + * @NFTA_GEN_ID: Ruleset generation ID (NLA_U32) + */ +enum nft_gen_attributes { + NFTA_GEN_UNSPEC, + NFTA_GEN_ID, + __NFTA_GEN_MAX +}; +#define NFTA_GEN_MAX (__NFTA_GEN_MAX - 1) + #endif /* _LINUX_NF_TABLES_H */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 82374601577e..a476b9962155 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -405,9 +405,9 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, }; -static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, - const struct nft_table *table) +static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -420,7 +420,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || @@ -448,8 +448,8 @@ static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; - err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table); + err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; @@ -488,7 +488,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_table_info(skb, + if (nf_tables_fill_table_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, @@ -540,7 +540,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, family, table); if (err < 0) @@ -914,9 +914,9 @@ nla_put_failure: return -ENOSPC; } -static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, - const struct nft_table *table, +static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table, const struct nft_chain *chain) { struct nlmsghdr *nlh; @@ -930,7 +930,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) goto nla_put_failure; @@ -988,8 +988,8 @@ static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; - err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table, + err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table, ctx->chain); if (err < 0) { kfree_skb(skb); @@ -1031,7 +1031,8 @@ static int nf_tables_dump_chains(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid, + if (nf_tables_fill_chain_info(skb, net, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, NLM_F_MULTI, @@ -1090,7 +1091,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, family, table, chain); if (err < 0) @@ -1647,8 +1648,9 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, }; -static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, +static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, + u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, const struct nft_rule *rule) @@ -1668,7 +1670,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) goto nla_put_failure; @@ -1724,8 +1726,8 @@ static int nf_tables_rule_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; - err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table, + err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table, ctx->chain, rule); if (err < 0) { kfree_skb(skb); @@ -1771,7 +1773,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid, + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, @@ -1837,7 +1839,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule); if (err < 0) @@ -2321,7 +2323,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx->afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; @@ -2925,7 +2927,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx.afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx.net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) goto nla_put_failure; @@ -3006,7 +3008,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx->afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; @@ -3293,6 +3295,87 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, return err; } +static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWGEN; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); + + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq))) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -EMSGSIZE; +} + +static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) +{ + struct nlmsghdr *nlh = nlmsg_hdr(skb); + struct sk_buff *skb2; + int err; + + if (nlmsg_report(nlh) && + !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb2 == NULL) + goto err; + + err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq); + if (err < 0) { + kfree_skb(skb2); + goto err; + } + + err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, + NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + +static int nf_tables_getgen(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + struct net *net = sock_net(skb->sk); + struct sk_buff *skb2; + int err; + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); +err: + kfree_skb(skb2); + return err; +} + static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { .call_batch = nf_tables_newtable, @@ -3369,6 +3452,9 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_GETGEN] = { + .call = nf_tables_getgen, + }, }; static void nft_chain_commit_update(struct nft_trans *trans) @@ -3526,6 +3612,8 @@ static int nf_tables_commit(struct sk_buff *skb) call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); } + nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + return 0; } -- cgit v1.2.3 From fd384412e199b62c3ddaabd18dce86d0e164c5b9 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Tue, 16 Sep 2014 17:31:16 -0700 Subject: udp_tunnel: Seperate ipv6 functions into its own file. Add ip6_udp_tunnel.c for ipv6 UDP tunnel functions to avoid ifdefs in udp_tunnel.c Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/net/udp_tunnel.h | 28 ++++++++++++++-- net/ipv4/udp_tunnel.c | 85 ++++++++++++----------------------------------- net/ipv6/Makefile | 1 + net/ipv6/ip6_udp_tunnel.c | 63 +++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 66 deletions(-) create mode 100644 net/ipv6/ip6_udp_tunnel.c (limited to 'include') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index ffd69cbded35..0b9e017c9038 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -26,7 +26,31 @@ struct udp_port_cfg { use_udp6_rx_checksums:1; }; -int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp); +int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp); + +#if IS_ENABLED(CONFIG_IPV6) +int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp); +#else +static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) +{ + return 0; +} +#endif + +static inline int udp_sock_create(struct net *net, + struct udp_port_cfg *cfg, + struct socket **sockp) +{ + if (cfg->family == AF_INET) + return udp_sock_create4(net, cfg, sockp); + + if (cfg->family == AF_INET6) + return udp_sock_create6(net, cfg, sockp); + + return -EPFNOSUPPORT; +} #endif diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 61ec1a65207e..7fccf6c3417a 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -8,83 +8,40 @@ #include #include -int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp) +int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) { int err = -EINVAL; struct socket *sock = NULL; + struct sockaddr_in udp_addr; -#if IS_ENABLED(CONFIG_IPV6) - if (cfg->family == AF_INET6) { - struct sockaddr_in6 udp6_addr; + err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); + if (err < 0) + goto error; - err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); - if (err < 0) - goto error; - - sk_change_net(sock->sk, net); - - udp6_addr.sin6_family = AF_INET6; - memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, - sizeof(udp6_addr.sin6_addr)); - udp6_addr.sin6_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, - sizeof(udp6_addr)); - if (err < 0) - goto error; - - if (cfg->peer_udp_port) { - udp6_addr.sin6_family = AF_INET6; - memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, - sizeof(udp6_addr.sin6_addr)); - udp6_addr.sin6_port = cfg->peer_udp_port; - err = kernel_connect(sock, - (struct sockaddr *)&udp6_addr, - sizeof(udp6_addr), 0); - } - if (err < 0) - goto error; - - udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); - udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); - } else -#endif - if (cfg->family == AF_INET) { - struct sockaddr_in udp_addr; - - err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); - if (err < 0) - goto error; + sk_change_net(sock->sk, net); - sk_change_net(sock->sk, net); + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->local_ip; + udp_addr.sin_port = cfg->local_udp_port; + err = kernel_bind(sock, (struct sockaddr *)&udp_addr, + sizeof(udp_addr)); + if (err < 0) + goto error; + if (cfg->peer_udp_port) { udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->local_ip; - udp_addr.sin_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp_addr, - sizeof(udp_addr)); + udp_addr.sin_addr = cfg->peer_ip; + udp_addr.sin_port = cfg->peer_udp_port; + err = kernel_connect(sock, (struct sockaddr *)&udp_addr, + sizeof(udp_addr), 0); if (err < 0) goto error; - - if (cfg->peer_udp_port) { - udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->peer_ip; - udp_addr.sin_port = cfg->peer_udp_port; - err = kernel_connect(sock, - (struct sockaddr *)&udp_addr, - sizeof(udp_addr), 0); - if (err < 0) - goto error; - } - - sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; - } else { - return -EPFNOSUPPORT; } + sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; *sockp = sock; - return 0; error: @@ -95,6 +52,6 @@ error: *sockp = NULL; return err; } -EXPORT_SYMBOL(udp_sock_create); +EXPORT_SYMBOL(udp_sock_create4); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 2fe68364bb20..45f830e5f820 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_IPV6_MIP6) += mip6.o obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o obj-$(CONFIG_IPV6_VTI) += ip6_vti.o obj-$(CONFIG_IPV6_SIT) += sit.o diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c new file mode 100644 index 000000000000..bcfbb4b496fc --- /dev/null +++ b/net/ipv6/ip6_udp_tunnel.c @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) +{ + struct sockaddr_in6 udp6_addr; + int err; + struct socket *sock = NULL; + + err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); + if (err < 0) + goto error; + + sk_change_net(sock->sk, net); + + udp6_addr.sin6_family = AF_INET6; + memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, + sizeof(udp6_addr.sin6_addr)); + udp6_addr.sin6_port = cfg->local_udp_port; + err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, + sizeof(udp6_addr)); + if (err < 0) + goto error; + + if (cfg->peer_udp_port) { + udp6_addr.sin6_family = AF_INET6; + memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, + sizeof(udp6_addr.sin6_addr)); + udp6_addr.sin6_port = cfg->peer_udp_port; + err = kernel_connect(sock, + (struct sockaddr *)&udp6_addr, + sizeof(udp6_addr), 0); + } + if (err < 0) + goto error; + + udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); + udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); + + *sockp = sock; + return 0; + +error: + if (sock) { + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); + } + *sockp = NULL; + return err; +} +EXPORT_SYMBOL_GPL(udp_sock_create6); -- cgit v1.2.3 From 6a93cc9052748c6355ec9d5b6c38b77f85f1cb0d Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Tue, 16 Sep 2014 17:31:17 -0700 Subject: udp-tunnel: Add a few more UDP tunnel APIs Added a few more UDP tunnel APIs that can be shared by UDP based tunnel protocol implementation. The main ones are highlighted below. setup_udp_tunnel_sock() configures UDP listener socket for receiving UDP encapsulated packets. udp_tunnel_xmit_skb() and upd_tunnel6_xmit_skb() transmit skb using UDP encapsulation. udp_tunnel_sock_release() closes the UDP tunnel listener socket. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/net/udp_tunnel.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/udp_tunnel.c | 53 ++++++++++++++++++++++++++++++++++++++++++- net/ipv6/ip6_udp_tunnel.c | 42 ++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 0b9e017c9038..a47790bcaa38 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -1,6 +1,14 @@ #ifndef __NET_UDP_TUNNEL_H #define __NET_UDP_TUNNEL_H +#include +#include + +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#endif + struct udp_port_cfg { u8 family; @@ -53,4 +61,53 @@ static inline int udp_sock_create(struct net *net, return -EPFNOSUPPORT; } +typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); +typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); + +struct udp_tunnel_sock_cfg { + void *sk_user_data; /* user data used by encap_rcv call back */ + /* Used for setting up udp_sock fields, see udp.h for details */ + __u8 encap_type; + udp_tunnel_encap_rcv_t encap_rcv; + udp_tunnel_encap_destroy_t encap_destroy; +}; + +/* Setup the given (UDP) sock to receive UDP encapsulated packets */ +void setup_udp_tunnel_sock(struct net *net, struct socket *sock, + struct udp_tunnel_sock_cfg *sock_cfg); + +/* Transmit the skb using UDP encapsulation. */ +int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, + __u8 tos, __u8 ttl, __be16 df, __be16 src_port, + __be16 dst_port, bool xnet); + +#if IS_ENABLED(CONFIG_IPV6) +int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst, + struct sk_buff *skb, struct net_device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be16 src_port, + __be16 dst_port); +#endif + +void udp_tunnel_sock_release(struct socket *sock); + +static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, + bool udp_csum) +{ + int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + + return iptunnel_handle_offloads(skb, udp_csum, type); +} + +static inline void udp_tunnel_encap_enable(struct socket *sock) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sock->sk->sk_family == PF_INET6) + ipv6_stub->udpv6_encap_enable(); + else +#endif + udp_encap_enable(); +} + #endif diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 7fccf6c3417a..1671263e5fa0 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -11,7 +11,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { - int err = -EINVAL; + int err; struct socket *sock = NULL; struct sockaddr_in udp_addr; @@ -54,4 +54,55 @@ error: } EXPORT_SYMBOL(udp_sock_create4); +void setup_udp_tunnel_sock(struct net *net, struct socket *sock, + struct udp_tunnel_sock_cfg *cfg) +{ + struct sock *sk = sock->sk; + + /* Disable multicast loopback */ + inet_sk(sk)->mc_loop = 0; + + /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ + udp_set_convert_csum(sk, true); + + rcu_assign_sk_user_data(sk, cfg->sk_user_data); + + udp_sk(sk)->encap_type = cfg->encap_type; + udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_destroy = cfg->encap_destroy; + + udp_tunnel_encap_enable(sock); +} +EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); + +int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, + __u8 tos, __u8 ttl, __be16 df, __be16 src_port, + __be16 dst_port, bool xnet) +{ + struct udphdr *uh; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + uh->len = htons(skb->len); + + udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len); + + return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP, + tos, ttl, df, xnet); +} +EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); + +void udp_tunnel_sock_release(struct socket *sock) +{ + rcu_assign_sk_user_data(sock->sk, NULL); + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); +} +EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); + MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index bcfbb4b496fc..cbc990712cc1 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -61,3 +61,45 @@ error: return err; } EXPORT_SYMBOL_GPL(udp_sock_create6); + +int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst, + struct sk_buff *skb, struct net_device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be16 src_port, __be16 dst_port) +{ + struct udphdr *uh; + struct ipv6hdr *ip6h; + struct sock *sk = sock->sk; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + + uh->len = htons(skb->len); + uh->check = 0; + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED + | IPSKB_REROUTED); + skb_dst_set(skb, dst); + + udp6_set_csum(udp_get_no_check6_tx(sk), skb, &inet6_sk(sk)->saddr, + &sk->sk_v6_daddr, skb->len); + + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + + ip6tunnel_xmit(skb, dev); + return 0; +} +EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); -- cgit v1.2.3 From 2e4e44107176d552f8bb1bb76053e850e3809841 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Sep 2014 04:49:49 -0700 Subject: net: add alloc_skb_with_frags() helper Extract from sock_alloc_send_pskb() code building skb with frags, so that we can reuse this in other contexts. Intent is to use it from tcp_send_rcvq(), tcp_collapse(), ... We also want to replace some skb_linearize() calls to a more reliable strategy in pathological cases where we need to reduce number of frags. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++ net/core/skbuff.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/core/sock.c | 78 ++++++++++---------------------------------------- 3 files changed, 99 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 756e3d057e84..f1bfa3781c75 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -769,6 +769,12 @@ static inline struct sk_buff *alloc_skb(unsigned int size, return __alloc_skb(size, priority, 0, NUMA_NO_NODE); } +struct sk_buff *alloc_skb_with_frags(unsigned long header_len, + unsigned long data_len, + int max_page_order, + int *errcode, + gfp_t gfp_mask); + static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 29f7f0121491..06a8feb10099 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4102,3 +4102,81 @@ err_free: return NULL; } EXPORT_SYMBOL(skb_vlan_untag); + +/** + * alloc_skb_with_frags - allocate skb with page frags + * + * header_len: size of linear part + * data_len: needed length in frags + * max_page_order: max page order desired. + * errcode: pointer to error code if any + * gfp_mask: allocation mask + * + * This can be used to allocate a paged skb, given a maximal order for frags. + */ +struct sk_buff *alloc_skb_with_frags(unsigned long header_len, + unsigned long data_len, + int max_page_order, + int *errcode, + gfp_t gfp_mask) +{ + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + unsigned long chunk; + struct sk_buff *skb; + struct page *page; + gfp_t gfp_head; + int i; + + *errcode = -EMSGSIZE; + /* Note this test could be relaxed, if we succeed to allocate + * high order pages... + */ + if (npages > MAX_SKB_FRAGS) + return NULL; + + gfp_head = gfp_mask; + if (gfp_head & __GFP_WAIT) + gfp_head |= __GFP_REPEAT; + + *errcode = -ENOBUFS; + skb = alloc_skb(header_len, gfp_head); + if (!skb) + return NULL; + + skb->truesize += npages << PAGE_SHIFT; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(gfp_mask | + __GFP_COMP | + __GFP_NOWARN | + __GFP_NORETRY, + order); + if (page) + goto fill_page; + /* Do not retry other high order allocations */ + order = 1; + max_page_order = 0; + } + order--; + } + page = alloc_page(gfp_mask); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; + } + return skb; + +failure: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(alloc_skb_with_frags); diff --git a/net/core/sock.c b/net/core/sock.c index 6f436b5e4961..de887c45c63b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1762,21 +1762,12 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { - struct sk_buff *skb = NULL; - unsigned long chunk; - gfp_t gfp_mask; + struct sk_buff *skb; long timeo; int err; - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - struct page *page; - int i; - - err = -EMSGSIZE; - if (npages > MAX_SKB_FRAGS) - goto failure; timeo = sock_sndtimeo(sk, noblock); - while (!skb) { + for (;;) { err = sock_error(sk); if (err != 0) goto failure; @@ -1785,66 +1776,27 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); - continue; - } - - err = -ENOBUFS; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; + if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) + break; - skb = alloc_skb(header_len, gfp_mask); - if (!skb) + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) goto failure; - - skb->truesize += data_len; - - for (i = 0; npages > 0; i++) { - int order = max_page_order; - - while (order) { - if (npages >= 1 << order) { - page = alloc_pages(sk->sk_allocation | - __GFP_COMP | - __GFP_NOWARN | - __GFP_NORETRY, - order); - if (page) - goto fill_page; - /* Do not retry other high order allocations */ - order = 1; - max_page_order = 0; - } - order--; - } - page = alloc_page(sk->sk_allocation); - if (!page) - goto failure; -fill_page: - chunk = min_t(unsigned long, data_len, - PAGE_SIZE << order); - skb_fill_page_desc(skb, i, page, 0, chunk); - data_len -= chunk; - npages -= 1 << order; - } + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); } - - skb_set_owner_w(skb, sk); + skb = alloc_skb_with_frags(header_len, data_len, max_page_order, + errcode, sk->sk_allocation); + if (skb) + skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: - kfree_skb(skb); *errcode = err; return NULL; } -- cgit v1.2.3 From bb7d93496f7ac203f7c3e9678000d1c83eb4e0ba Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 19 Sep 2014 13:07:50 -0700 Subject: net: phy: broadcom: add helper for PHY revision and patch level The Broadcom BCM7xxx internal PHYs do not contain any useful revision information in the low 4-bits of their MII_PHYSID2 (MII register 3) which could allow us to properly identify them. As a result, we need the actual hardware block integrating these PHYs: GENET or the SF2 switch to tell us what revision they are built with. To assist with that, add two helper macros for fetching the the PHY revision and patch level from the struct phy_device::dev_flags. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 5bd35cc0d471..e4ee9c6679e8 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -41,6 +41,8 @@ #define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00008000 /* Broadcom BCM7xxx specific workarounds */ #define PHY_BRCM_100MBPS_WAR 0x00010000 +#define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) +#define PHY_BRCM_7XXX_PATCH(x) ((x) & 0xff) #define PHY_BCM_FLAGS_VALID 0x80000000 /* Broadcom BCM54XX register definitions, common to most Broadcom PHYs */ -- cgit v1.2.3 From 80780a54ecded1647e661ababde13554a149f7f3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 19 Sep 2014 13:07:52 -0700 Subject: net: bcmgenet: remove PHY_BRCM_100MBPS_WAR Now that we have removed the need for the PHY_BRCM_100MBPS_WAR flag, we can remove it from the GENET driver and the broadcom shared header file. The PHY driver checks the PHY supported bitmask instead. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmmii.c | 10 ---------- include/linux/brcmphy.h | 1 - 2 files changed, 11 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index c88f7ae99636..71f2ef7d33a9 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -296,7 +296,6 @@ static int bcmgenet_mii_probe(struct net_device *dev) struct bcmgenet_priv *priv = netdev_priv(dev); struct device_node *dn = priv->pdev->dev.of_node; struct phy_device *phydev; - unsigned int phy_flags; int ret; if (priv->phydev) { @@ -338,15 +337,6 @@ static int bcmgenet_mii_probe(struct net_device *dev) return ret; } - phy_flags = PHY_BRCM_100MBPS_WAR; - - /* workarounds are only needed for 100Mpbs PHYs, and - * never on GENET V1 hardware - */ - if ((phydev->supported & PHY_GBIT_FEATURES) || GENET_IS_V1(priv)) - phy_flags = 0; - - phydev->dev_flags |= phy_flags; phydev->advertising = phydev->supported; /* The internal PHY has its link interrupts routed to the diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index e4ee9c6679e8..3f626fe48c9a 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -40,7 +40,6 @@ #define PHY_BRCM_CLEAR_RGMII_MODE 0x00004000 #define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00008000 /* Broadcom BCM7xxx specific workarounds */ -#define PHY_BRCM_100MBPS_WAR 0x00010000 #define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) #define PHY_BRCM_7XXX_PATCH(x) ((x) & 0xff) #define PHY_BCM_FLAGS_VALID 0x80000000 -- cgit v1.2.3 From 6819563e646a7f3692836daefd12cd86c697759f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 19 Sep 2014 13:07:54 -0700 Subject: net: dsa: allow switch drivers to specify phy_device::dev_flags Some switch drivers (e.g: bcm_sf2) may have to communicate specific workarounds or flags towards the PHY device driver. Allow switches driver to be delegated that task by introducing a get_phy_flags() callback which will do just that. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/slave.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index c779e9bba1b3..e020b8a12b7d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -181,6 +181,7 @@ struct dsa_switch_driver { char *(*probe)(struct device *host_dev, int sw_addr); int (*setup)(struct dsa_switch *ds); int (*set_addr)(struct dsa_switch *ds, u8 *addr); + u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* * Access to the switch's PHY registers. diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 90c9689ed362..a7997265019a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -371,6 +371,7 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, struct dsa_chip_data *cd = ds->pd; struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; + u32 phy_flags = 0; int ret; port_dn = cd->port_dn[p->port]; @@ -390,9 +391,12 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, phy_dn = port_dn; } + if (ds->drv->get_phy_flags) + phy_flags = ds->drv->get_phy_flags(ds, p->port); + if (phy_dn) p->phy = of_phy_connect(slave_dev, phy_dn, - dsa_slave_adjust_link, 0, + dsa_slave_adjust_link, phy_flags, p->phy_interface); if (p->phy && phy_is_fixed) @@ -480,6 +484,9 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, netif_carrier_off(slave_dev); if (p->phy != NULL) { + if (ds->drv->get_phy_flags(ds, port)) + p->phy->dev_flags |= ds->drv->get_phy_flags(ds, port); + phy_attach(slave_dev, dev_name(&p->phy->dev), PHY_INTERFACE_MODE_GMII); -- cgit v1.2.3 From 23461551c00628c3f3fe9cf837bf53cf8f212b63 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:25:56 -0700 Subject: fou: Support for foo-over-udp RX path This patch provides a receive path for foo-over-udp. This allows direct encapsulation of IP protocols over UDP. The bound destination port is used to map to an IP protocol, and the XFRM framework (udp_encap_rcv) is used to receive encapsulated packets. Upon reception, the encapsulation header is logically removed (pointer to transport header is advanced) and the packet is reinjected into the receive path with the IP protocol indicated by the mapping. Netlink is used to configure FOU ports. The configuration information includes the port number to bind to and the IP protocol corresponding to that port. This should support GRE/UDP (http://tools.ietf.org/html/draft-yong-tsvwg-gre-in-udp-encap-02), as will as the other IP tunneling protocols (IPIP, SIT). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/fou.h | 32 ++++++ net/ipv4/Kconfig | 10 ++ net/ipv4/Makefile | 1 + net/ipv4/fou.c | 279 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 322 insertions(+) create mode 100644 include/uapi/linux/fou.h create mode 100644 net/ipv4/fou.c (limited to 'include') diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h new file mode 100644 index 000000000000..e03376de453d --- /dev/null +++ b/include/uapi/linux/fou.h @@ -0,0 +1,32 @@ +/* fou.h - FOU Interface */ + +#ifndef _UAPI_LINUX_FOU_H +#define _UAPI_LINUX_FOU_H + +/* NETLINK_GENERIC related info + */ +#define FOU_GENL_NAME "fou" +#define FOU_GENL_VERSION 0x1 + +enum { + FOU_ATTR_UNSPEC, + FOU_ATTR_PORT, /* u16 */ + FOU_ATTR_AF, /* u8 */ + FOU_ATTR_IPPROTO, /* u8 */ + + __FOU_ATTR_MAX, +}; + +#define FOU_ATTR_MAX (__FOU_ATTR_MAX - 1) + +enum { + FOU_CMD_UNSPEC, + FOU_CMD_ADD, + FOU_CMD_DEL, + + __FOU_CMD_MAX, +}; + +#define FOU_CMD_MAX (__FOU_CMD_MAX - 1) + +#endif /* _UAPI_LINUX_FOU_H */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index dbc10d84161f..84f710b7472a 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -311,6 +311,16 @@ config NET_UDP_TUNNEL tristate default n +config NET_FOU + tristate "IP: Foo (IP protocols) over UDP" + select XFRM + select NET_UDP_TUNNEL + ---help--- + Foo over UDP allows any IP protocol to be directly encapsulated + over UDP include tunnels (IPIP, GRE, SIT). By encapsulating in UDP + network mechanisms and optimizations for UDP (such as ECMP + and RSS) can be leveraged to provide better service. + config INET_AH tristate "IP: AH transformation" select XFRM_ALGO diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 8ee1cd4053ee..d78d404c596f 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o gre-y := gre_demux.o +obj-$(CONFIG_NET_FOU) += fou.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c new file mode 100644 index 000000000000..d44f97b79418 --- /dev/null +++ b/net/ipv4/fou.c @@ -0,0 +1,279 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(fou_lock); +static LIST_HEAD(fou_list); + +struct fou { + struct socket *sock; + u8 protocol; + u16 port; + struct list_head list; +}; + +struct fou_cfg { + u8 protocol; + struct udp_port_cfg udp_config; +}; + +static inline struct fou *fou_from_sock(struct sock *sk) +{ + return sk->sk_user_data; +} + +static int fou_udp_encap_recv_deliver(struct sk_buff *skb, + u8 protocol, size_t len) +{ + struct iphdr *iph = ip_hdr(skb); + + /* Remove 'len' bytes from the packet (UDP header and + * FOU header if present), modify the protocol to the one + * we found, and then call rcv_encap. + */ + iph->tot_len = htons(ntohs(iph->tot_len) - len); + __skb_pull(skb, len); + skb_postpull_rcsum(skb, udp_hdr(skb), len); + skb_reset_transport_header(skb); + + return -protocol; +} + +static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) +{ + struct fou *fou = fou_from_sock(sk); + + if (!fou) + return 1; + + return fou_udp_encap_recv_deliver(skb, fou->protocol, + sizeof(struct udphdr)); +} + +static int fou_add_to_port_list(struct fou *fou) +{ + struct fou *fout; + + spin_lock(&fou_lock); + list_for_each_entry(fout, &fou_list, list) { + if (fou->port == fout->port) { + spin_unlock(&fou_lock); + return -EALREADY; + } + } + + list_add(&fou->list, &fou_list); + spin_unlock(&fou_lock); + + return 0; +} + +static void fou_release(struct fou *fou) +{ + struct socket *sock = fou->sock; + struct sock *sk = sock->sk; + + udp_del_offload(&fou->udp_offloads); + + list_del(&fou->list); + + /* Remove hooks into tunnel socket */ + sk->sk_user_data = NULL; + + sock_release(sock); + + kfree(fou); +} + +static int fou_create(struct net *net, struct fou_cfg *cfg, + struct socket **sockp) +{ + struct fou *fou = NULL; + int err; + struct socket *sock = NULL; + struct sock *sk; + + /* Open UDP socket */ + err = udp_sock_create(net, &cfg->udp_config, &sock); + if (err < 0) + goto error; + + /* Allocate FOU port structure */ + fou = kzalloc(sizeof(*fou), GFP_KERNEL); + if (!fou) { + err = -ENOMEM; + goto error; + } + + sk = sock->sk; + + /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ + fou->protocol = cfg->protocol; + fou->port = cfg->udp_config.local_udp_port; + udp_sk(sk)->encap_rcv = fou_udp_recv; + + udp_sk(sk)->encap_type = 1; + udp_encap_enable(); + + sk->sk_user_data = fou; + fou->sock = sock; + + udp_set_convert_csum(sk, true); + + sk->sk_allocation = GFP_ATOMIC; + + err = fou_add_to_port_list(fou); + if (err) + goto error; + + if (sockp) + *sockp = sock; + + return 0; + +error: + kfree(fou); + if (sock) + sock_release(sock); + + return err; +} + +static int fou_destroy(struct net *net, struct fou_cfg *cfg) +{ + struct fou *fou; + u16 port = cfg->udp_config.local_udp_port; + int err = -EINVAL; + + spin_lock(&fou_lock); + list_for_each_entry(fou, &fou_list, list) { + if (fou->port == port) { + fou_release(fou); + err = 0; + break; + } + } + spin_unlock(&fou_lock); + + return err; +} + +static struct genl_family fou_nl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = FOU_GENL_NAME, + .version = FOU_GENL_VERSION, + .maxattr = FOU_ATTR_MAX, + .netnsok = true, +}; + +static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { + [FOU_ATTR_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_AF] = { .type = NLA_U8, }, + [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, +}; + +static int parse_nl_config(struct genl_info *info, + struct fou_cfg *cfg) +{ + memset(cfg, 0, sizeof(*cfg)); + + cfg->udp_config.family = AF_INET; + + if (info->attrs[FOU_ATTR_AF]) { + u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); + + if (family != AF_INET && family != AF_INET6) + return -EINVAL; + + cfg->udp_config.family = family; + } + + if (info->attrs[FOU_ATTR_PORT]) { + u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]); + + cfg->udp_config.local_udp_port = port; + } + + if (info->attrs[FOU_ATTR_IPPROTO]) + cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); + + return 0; +} + +static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info) +{ + struct fou_cfg cfg; + int err; + + err = parse_nl_config(info, &cfg); + if (err) + return err; + + return fou_create(&init_net, &cfg, NULL); +} + +static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) +{ + struct fou_cfg cfg; + + parse_nl_config(info, &cfg); + + return fou_destroy(&init_net, &cfg); +} + +static const struct genl_ops fou_nl_ops[] = { + { + .cmd = FOU_CMD_ADD, + .doit = fou_nl_cmd_add_port, + .policy = fou_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = FOU_CMD_DEL, + .doit = fou_nl_cmd_rm_port, + .policy = fou_nl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + +static int __init fou_init(void) +{ + int ret; + + ret = genl_register_family_with_ops(&fou_nl_family, + fou_nl_ops); + + return ret; +} + +static void __exit fou_fini(void) +{ + struct fou *fou, *next; + + genl_unregister_family(&fou_nl_family); + + /* Close all the FOU sockets */ + + spin_lock(&fou_lock); + list_for_each_entry_safe(fou, next, &fou_list, list) + fou_release(fou); + spin_unlock(&fou_lock); +} + +module_init(fou_init); +module_exit(fou_fini); +MODULE_AUTHOR("Tom Herbert "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From afe93325bc02a5b2dea0cd7d78225de692265e6e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:25:57 -0700 Subject: fou: Add GRO support Implement fou_gro_receive and fou_gro_complete, and populate these in the correponsing udp_offloads for the socket. Added ipproto to udp_offloads and pass this from UDP to the fou GRO routine in proto field of napi_gro_cb structure. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +- net/ipv4/fou.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/udp_offload.c | 5 ++- 3 files changed, 95 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28d4378615e5..4354b4307e37 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1874,7 +1874,7 @@ struct napi_gro_cb { /* jiffies when first packet was created/queued */ unsigned long age; - /* Used in ipv6_gro_receive() */ + /* Used in ipv6_gro_receive() and foo-over-udp */ u16 proto; /* Used in udp_gro_receive */ @@ -1925,6 +1925,7 @@ struct packet_offload { struct udp_offload { __be16 port; + u8 ipproto; struct offload_callbacks callbacks; }; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index d44f97b79418..dced89fbe480 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ struct fou { struct socket *sock; u8 protocol; u16 port; + struct udp_offload udp_offloads; struct list_head list; }; @@ -62,6 +64,69 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) sizeof(struct udphdr)); } +static struct sk_buff **fou_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + const struct net_offload **offloads) +{ + const struct net_offload *ops; + struct sk_buff **pp = NULL; + u8 proto = NAPI_GRO_CB(skb)->proto; + + rcu_read_lock(); + ops = rcu_dereference(offloads[proto]); + if (!ops || !ops->callbacks.gro_receive) + goto out_unlock; + + pp = ops->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); + + return pp; +} + +static int fou_gro_complete(struct sk_buff *skb, int nhoff, + const struct net_offload **offloads) +{ + const struct net_offload *ops; + u8 proto = NAPI_GRO_CB(skb)->proto; + int err = -ENOSYS; + + rcu_read_lock(); + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + goto out_unlock; + + err = ops->callbacks.gro_complete(skb, nhoff); + +out_unlock: + rcu_read_unlock(); + + return err; +} + +static struct sk_buff **fou4_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + return fou_gro_receive(head, skb, inet_offloads); +} + +static int fou4_gro_complete(struct sk_buff *skb, int nhoff) +{ + return fou_gro_complete(skb, nhoff, inet_offloads); +} + +static struct sk_buff **fou6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + return fou_gro_receive(head, skb, inet6_offloads); +} + +static int fou6_gro_complete(struct sk_buff *skb, int nhoff) +{ + return fou_gro_complete(skb, nhoff, inet6_offloads); +} + static int fou_add_to_port_list(struct fou *fou) { struct fou *fout; @@ -134,6 +199,29 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; + switch (cfg->udp_config.family) { + case AF_INET: + fou->udp_offloads.callbacks.gro_receive = fou4_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou4_gro_complete; + break; + case AF_INET6: + fou->udp_offloads.callbacks.gro_receive = fou6_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou6_gro_complete; + break; + default: + err = -EPFNOSUPPORT; + goto error; + } + + fou->udp_offloads.port = cfg->udp_config.local_udp_port; + fou->udp_offloads.ipproto = cfg->protocol; + + if (cfg->udp_config.family == AF_INET) { + err = udp_add_offload(&fou->udp_offloads); + if (err) + goto error; + } + err = fou_add_to_port_list(fou); if (err) goto error; @@ -160,6 +248,7 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) spin_lock(&fou_lock); list_for_each_entry(fou, &fou_list, list) { if (fou->port == port) { + udp_del_offload(&fou->udp_offloads); fou_release(fou); err = 0; break; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index adab393b2fe5..d7c43f764c71 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -276,6 +276,7 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; pp = uo_priv->offload->callbacks.gro_receive(head, skb); out_unlock: @@ -329,8 +330,10 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) break; } - if (uo_priv != NULL) + if (uo_priv != NULL) { + NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + } rcu_read_unlock(); return err; -- cgit v1.2.3 From 56328486539ddd07cbaafec7a542a2c8a3043623 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:25:58 -0700 Subject: net: Changes to ip_tunnel to support foo-over-udp encapsulation This patch changes IP tunnel to support (secondary) encapsulation, Foo-over-UDP. Changes include: 1) Adding tun_hlen as the tunnel header length, encap_hlen as the encapsulation header length, and hlen becomes the grand total of these. 2) Added common netlink define to support FOU encapsulation. 3) Routines to perform FOU encapsulation. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 19 ++++++++- include/uapi/linux/if_tunnel.h | 12 ++++++ net/ipv4/ip_tunnel.c | 91 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 120 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 8dd8cab88b87..7f538ba6e267 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_IPV6) @@ -31,6 +32,13 @@ struct ip_tunnel_6rd_parm { }; #endif +struct ip_tunnel_encap { + __u16 type; + __u16 flags; + __be16 sport; + __be16 dport; +}; + struct ip_tunnel_prl_entry { struct ip_tunnel_prl_entry __rcu *next; __be32 addr; @@ -56,13 +64,18 @@ struct ip_tunnel { /* These four fields used only by GRE */ __u32 i_seqno; /* The last seen seqno */ __u32 o_seqno; /* The last output seqno */ - int hlen; /* Precalculated header length */ + int tun_hlen; /* Precalculated header length */ int mlink; struct ip_tunnel_dst __percpu *dst_cache; struct ip_tunnel_parm parms; + int encap_hlen; /* Encap header length (FOU,GUE) */ + struct ip_tunnel_encap encap; + + int hlen; /* tun_hlen + encap_hlen */ + /* for SIT */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm ip6rd; @@ -114,6 +127,8 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, + u8 *protocol, struct flowi4 *fl4); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, @@ -131,6 +146,8 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); void ip_tunnel_setup(struct net_device *dev, int net_id); void ip_tunnel_dst_reset_all(struct ip_tunnel *t); +int ip_tunnel_encap_setup(struct ip_tunnel *t, + struct ip_tunnel_encap *ipencap); /* Extract dsfield from inner protocol */ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 3bce9e9d9f7c..9fedca7d8dae 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -53,10 +53,22 @@ enum { IFLA_IPTUN_6RD_RELAY_PREFIX, IFLA_IPTUN_6RD_PREFIXLEN, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, + IFLA_IPTUN_ENCAP_TYPE, + IFLA_IPTUN_ENCAP_FLAGS, + IFLA_IPTUN_ENCAP_SPORT, + IFLA_IPTUN_ENCAP_DPORT, __IFLA_IPTUN_MAX, }; #define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) +enum tunnel_encap_types { + TUNNEL_ENCAP_NONE, + TUNNEL_ENCAP_FOU, +}; + +#define TUNNEL_ENCAP_FLAG_CSUM (1<<0) +#define TUNNEL_ENCAP_FLAG_CSUM6 (1<<1) + /* SIT-mode i_flags */ #define SIT_ISATAP 0x0001 diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index afed1aac2638..e3a3dc91e49c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -55,6 +55,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -487,6 +488,91 @@ drop: } EXPORT_SYMBOL_GPL(ip_tunnel_rcv); +static int ip_encap_hlen(struct ip_tunnel_encap *e) +{ + switch (e->type) { + case TUNNEL_ENCAP_NONE: + return 0; + case TUNNEL_ENCAP_FOU: + return sizeof(struct udphdr); + default: + return -EINVAL; + } +} + +int ip_tunnel_encap_setup(struct ip_tunnel *t, + struct ip_tunnel_encap *ipencap) +{ + int hlen; + + memset(&t->encap, 0, sizeof(t->encap)); + + hlen = ip_encap_hlen(ipencap); + if (hlen < 0) + return hlen; + + t->encap.type = ipencap->type; + t->encap.sport = ipencap->sport; + t->encap.dport = ipencap->dport; + t->encap.flags = ipencap->flags; + + t->encap_hlen = hlen; + t->hlen = t->encap_hlen + t->tun_hlen; + + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); + +static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + size_t hdr_len, u8 *protocol, struct flowi4 *fl4) +{ + struct udphdr *uh; + __be16 sport; + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + /* Get length and hash before making space in skb */ + + sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + + skb_push(skb, hdr_len); + + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + uh->check = 0; + udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, + fl4->saddr, fl4->daddr, skb->len); + + *protocol = IPPROTO_UDP; + + return 0; +} + +int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, + u8 *protocol, struct flowi4 *fl4) +{ + switch (t->encap.type) { + case TUNNEL_ENCAP_NONE: + return 0; + case TUNNEL_ENCAP_FOU: + return fou_build_header(skb, &t->encap, t->encap_hlen, + protocol, fl4); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(ip_tunnel_encap); + static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df) { @@ -536,7 +622,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, } void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, - const struct iphdr *tnl_params, const u8 protocol) + const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *inner_iph; @@ -617,6 +703,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); + if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) + goto tx_error; + rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; if (!rt) { -- cgit v1.2.3 From 4565e9919cda747815547e2e5d7b78f15efbffdf Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:26:01 -0700 Subject: gre: Setup and TX path for gre/UDP foo-over-udp encapsulation Added netlink attrs to configure FOU encapsulation for GRE, netlink handling of these flags, and properly adjust MTU for encapsulation. ip_tunnel_encap is called from ip_tunnel_xmit to actually perform FOU encapsulation. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/if_tunnel.h | 4 ++ net/ipv4/ip_gre.c | 90 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 9fedca7d8dae..7c832afdfa94 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -106,6 +106,10 @@ enum { IFLA_GRE_ENCAP_LIMIT, IFLA_GRE_FLOWINFO, IFLA_GRE_FLAGS, + IFLA_GRE_ENCAP_TYPE, + IFLA_GRE_ENCAP_FLAGS, + IFLA_GRE_ENCAP_SPORT, + IFLA_GRE_ENCAP_DPORT, __IFLA_GRE_MAX, }; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9b842544aea3..829aff8bf723 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -239,7 +239,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, tpi.seq = htonl(tunnel->o_seqno); /* Push GRE header. */ - gre_build_header(skb, &tpi, tunnel->hlen); + gre_build_header(skb, &tpi, tunnel->tun_hlen); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } @@ -310,7 +310,7 @@ out: static int ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - int err = 0; + int err; struct ip_tunnel_parm p; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) @@ -470,13 +470,18 @@ static void ipgre_tunnel_setup(struct net_device *dev) static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; + int t_hlen; tunnel = netdev_priv(dev); - tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); + tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); tunnel->parms.iph.protocol = IPPROTO_GRE; - dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; - dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + + t_hlen = tunnel->hlen + sizeof(struct iphdr); + + dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; + dev->mtu = ETH_DATA_LEN - t_hlen - 4; dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -628,6 +633,40 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], parms->iph.frag_off = htons(IP_DF); } +/* This function returns true when ENCAP attributes are present in the nl msg */ +static bool ipgre_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_GRE_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); + } + + if (data[IFLA_GRE_ENCAP_FLAGS]) { + ret = true; + ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); + } + + if (data[IFLA_GRE_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_u16(data[IFLA_GRE_ENCAP_SPORT]); + } + + if (data[IFLA_GRE_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_u16(data[IFLA_GRE_ENCAP_DPORT]); + } + + return ret; +} + static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); @@ -657,6 +696,15 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipgre_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipgre_netlink_parms(data, tb, &p); return ip_tunnel_newlink(dev, tb, &p); @@ -666,6 +714,15 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipgre_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipgre_netlink_parms(data, tb, &p); return ip_tunnel_changelink(dev, tb, &p); @@ -694,6 +751,14 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_GRE_PMTUDISC */ nla_total_size(1) + + /* IFLA_GRE_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_DPORT */ + nla_total_size(2) + 0; } @@ -714,6 +779,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)))) goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, + t->encap.type) || + nla_put_u16(skb, IFLA_GRE_ENCAP_SPORT, + t->encap.sport) || + nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT, + t->encap.dport) || + nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, + t->encap.dport)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -731,6 +807,10 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_TOS] = { .type = NLA_U8 }, [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, + [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { -- cgit v1.2.3 From 54003f119c26573d3bb86a5efc64f3e5fd43b8c6 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 17 Sep 2014 23:23:12 +0200 Subject: net: fix sparse warnings in SNMP_UPD_PO_STATS(_BH) ptr used to be a non __percpu pointer (result of a this_cpu_ptr assignment, 7d720c3e4f0c4 ("percpu: add __percpu sparse annotations to net")). Since d25398df59b56 ("net: avoid reloads in SNMP_UPD_PO_STATS"), that's no longer the case, SNMP_UPD_PO_STATS uses this_cpu_add and ptr is now __percpu. Silence sparse warnings by preserving the original type and annotation, and remove the out-of-date comment. warning: incorrect type in initializer (different address spaces) expected unsigned long long *ptr got unsigned long long [noderef] * warning: incorrect type in initializer (different address spaces) expected void const [noderef] *__vpp_verify got unsigned long long * warning: incorrect type in initializer (different address spaces) expected void const [noderef] *__vpp_verify got unsigned long long * Signed-off-by: Sabrina Dubroca Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/snmp.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/snmp.h b/include/net/snmp.h index f1f27fdbb0d5..8fd2f498782e 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -146,19 +146,15 @@ struct linux_xfrm_mib { #define SNMP_ADD_STATS(mib, field, addend) \ this_cpu_add(mib->mibs[field], addend) -/* - * Use "__typeof__(*mib) *ptr" instead of "__typeof__(mib) ptr" - * to make @ptr a non-percpu pointer. - */ #define SNMP_UPD_PO_STATS(mib, basefield, addend) \ do { \ - __typeof__(*mib->mibs) *ptr = mib->mibs; \ + __typeof__((mib->mibs) + 0) ptr = mib->mibs; \ this_cpu_inc(ptr[basefield##PKTS]); \ this_cpu_add(ptr[basefield##OCTETS], addend); \ } while (0) #define SNMP_UPD_PO_STATS_BH(mib, basefield, addend) \ do { \ - __typeof__(*mib->mibs) *ptr = mib->mibs; \ + __typeof__((mib->mibs) + 0) ptr = mib->mibs; \ __this_cpu_inc(ptr[basefield##PKTS]); \ __this_cpu_add(ptr[basefield##OCTETS], addend); \ } while (0) -- cgit v1.2.3 From 77507aa249aecd06fa25ad058b64481e46887a01 Mon Sep 17 00:00:00 2001 From: Ido Shamay Date: Thu, 18 Sep 2014 11:50:59 +0300 Subject: net/mlx4_core: Enable CQE/EQE stride support This feature is intended for archs having cache line larger then 64B. Since our CQE/EQEs are generally 64B in those systems, HW will write twice to the same cache line consecutively, causing pipe locks due to he hazard prevention mechanism. For elements in a cyclic buffer, writes are consecutive, so entries smaller than a cache line should be avoided, especially if they are written at a high rate. Reduce consecutive writes to same cache line in CQs/EQs, by allowing the driver to increase the distance between entries so that each will reside in a different cache line. Until the introduction of this feature, there were two types of CQE/EQE: 1. 32B stride and context in the [0-31] segment 2. 64B stride and context in the [32-63] segment This feature introduces two additional types: 3. 128B stride and context in the [0-31] segment (128B cache line) 4. 256B stride and context in the [0-31] segment (256B cache line) Modify the mlx4_core driver to query the device for the CQE/EQE cache line stride capability and to enable that capability when the host cache line size is larger than 64 bytes (supported cache lines are 128B and 256B). The mlx4 IB driver and libmlx4 need not be aware of this change. The PF context behaviour is changed to require this change in VF drivers running on such archs. Signed-off-by: Ido Shamay Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/fw.c | 38 ++++++++++++++++++- drivers/net/ethernet/mellanox/mlx4/fw.h | 2 + drivers/net/ethernet/mellanox/mlx4/main.c | 61 ++++++++++++++++++++++++++++++- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 ++ include/linux/mlx4/device.h | 11 ++++-- 5 files changed, 108 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 494753e44ae3..13b2e4a51ef4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -137,7 +137,9 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [8] = "Dynamic QP updates support", [9] = "Device managed flow steering IPoIB support", [10] = "TCP/IP offloads/flow-steering for VXLAN support", - [11] = "MAD DEMUX (Secure-Host) support" + [11] = "MAD DEMUX (Secure-Host) support", + [12] = "Large cache line (>64B) CQE stride support", + [13] = "Large cache line (>64B) EQE stride support" }; int i; @@ -557,6 +559,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET 0x74 #define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET 0x76 #define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET 0x77 +#define QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE 0x7a #define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET 0x80 #define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET 0x82 #define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET 0x84 @@ -733,6 +736,11 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->max_rq_sg = field; MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET); dev_cap->max_rq_desc_sz = size; + MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE); + if (field & (1 << 6)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + if (field & (1 << 7)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE; MLX4_GET(dev_cap->bmme_flags, outbox, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); @@ -1376,6 +1384,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30) #define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37) #define INIT_HCA_EQE_CQE_OFFSETS (INIT_HCA_QPC_OFFSET + 0x38) +#define INIT_HCA_EQE_CQE_STRIDE_OFFSET (INIT_HCA_QPC_OFFSET + 0x3b) #define INIT_HCA_ALTC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40) #define INIT_HCA_AUXC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50) #define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60) @@ -1452,11 +1461,25 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) { *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30); dev->caps.cqe_size = 64; - dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; } else { dev->caps.cqe_size = 32; } + /* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */ + if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) && + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) { + dev->caps.eqe_size = cache_line_size(); + dev->caps.cqe_size = cache_line_size(); + dev->caps.eqe_factor = 0; + MLX4_PUT(inbox, (u8)((ilog2(dev->caps.eqe_size) - 5) << 4 | + (ilog2(dev->caps.eqe_size) - 5)), + INIT_HCA_EQE_CQE_STRIDE_OFFSET); + + /* User still need to know to support CQE > 32B */ + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } + /* QPC/EEC/CQC/EQC/RDMARC attributes */ MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); @@ -1616,6 +1639,17 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, if (byte_field & 0x40) /* 64-bytes cqe enabled */ param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; + /* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */ + MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_STRIDE_OFFSET); + if (byte_field) { + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED; + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; + param->cqe_size = 1 << ((byte_field & + MLX4_CQE_SIZE_MASK_STRIDE) + 5); + param->eqe_size = 1 << (((byte_field & + MLX4_EQE_SIZE_MASK_STRIDE) >> 4) + 5); + } + /* TPT attributes */ MLX4_GET(param->dmpt_base, outbox, INIT_HCA_DMPT_BASE_OFFSET); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index 1fce03ebe5c4..9b835aecac96 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -178,6 +178,8 @@ struct mlx4_init_hca_param { u8 uar_page_sz; /* log pg sz in 4k chunks */ u8 steering_mode; /* for QUERY_HCA */ u64 dev_cap_enabled; + u16 cqe_size; /* For use only when CQE stride feature enabled */ + u16 eqe_size; /* For use only when EQE stride feature enabled */ }; struct mlx4_init_ib_param { diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 7e2d5d57c598..1f10023af1db 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -104,7 +104,8 @@ module_param(enable_64b_cqe_eqe, bool, 0444); MODULE_PARM_DESC(enable_64b_cqe_eqe, "Enable 64 byte CQEs/EQEs when the FW supports this (default: True)"); -#define PF_CONTEXT_BEHAVIOUR_MASK MLX4_FUNC_CAP_64B_EQE_CQE +#define PF_CONTEXT_BEHAVIOUR_MASK (MLX4_FUNC_CAP_64B_EQE_CQE | \ + MLX4_FUNC_CAP_EQE_CQE_STRIDE) static char mlx4_version[] = DRV_NAME ": Mellanox ConnectX core driver v" @@ -196,6 +197,40 @@ static void mlx4_set_port_mask(struct mlx4_dev *dev) dev->caps.port_mask[i] = dev->caps.port_type[i]; } +static void mlx4_enable_cqe_eqe_stride(struct mlx4_dev *dev) +{ + struct mlx4_caps *dev_cap = &dev->caps; + + /* FW not supporting or cancelled by user */ + if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) || + !(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) + return; + + /* Must have 64B CQE_EQE enabled by FW to use bigger stride + * When FW has NCSI it may decide not to report 64B CQE/EQEs + */ + if (!(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_EQE) || + !(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_CQE)) { + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + return; + } + + if (cache_line_size() == 128 || cache_line_size() == 256) { + mlx4_dbg(dev, "Enabling CQE stride cacheLine supported\n"); + /* Changing the real data inside CQE size to 32B */ + dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; + dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; + + if (mlx4_is_master(dev)) + dev_cap->function_caps |= MLX4_FUNC_CAP_EQE_CQE_STRIDE; + } else { + mlx4_dbg(dev, "Disabling CQE stride cacheLine unsupported\n"); + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + } +} + static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { int err; @@ -390,6 +425,14 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; } + + if (dev_cap->flags2 & + (MLX4_DEV_CAP_FLAG2_CQE_STRIDE | + MLX4_DEV_CAP_FLAG2_EQE_STRIDE)) { + mlx4_warn(dev, "Disabling EQE/CQE stride per user request\n"); + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + } } if ((dev->caps.flags & @@ -397,6 +440,9 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) mlx4_is_master(dev)) dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE; + if (!mlx4_is_slave(dev)) + mlx4_enable_cqe_eqe_stride(dev); + return 0; } @@ -724,11 +770,22 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) { dev->caps.cqe_size = 64; - dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; } else { dev->caps.cqe_size = 32; } + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_EQE_STRIDE_ENABLED) { + dev->caps.eqe_size = hca_param.eqe_size; + dev->caps.eqe_factor = 0; + } + + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_CQE_STRIDE_ENABLED) { + dev->caps.cqe_size = hca_param.cqe_size; + /* User still need to know when CQE > 32B */ + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; mlx4_warn(dev, "Timestamping is not supported in slave mode\n"); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index b508c7887ef8..de10dbb2e6ed 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -285,6 +285,9 @@ struct mlx4_icm_table { #define MLX4_MPT_STATUS_SW 0xF0 #define MLX4_MPT_STATUS_HW 0x00 +#define MLX4_CQE_SIZE_MASK_STRIDE 0x3 +#define MLX4_EQE_SIZE_MASK_STRIDE 0x30 + /* * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits. */ diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 1befd8df9cfc..7bcefe749a39 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -185,19 +185,24 @@ enum { MLX4_DEV_CAP_FLAG2_DMFS_IPOIB = 1LL << 9, MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS = 1LL << 10, MLX4_DEV_CAP_FLAG2_MAD_DEMUX = 1LL << 11, + MLX4_DEV_CAP_FLAG2_CQE_STRIDE = 1LL << 12, + MLX4_DEV_CAP_FLAG2_EQE_STRIDE = 1LL << 13 }; enum { MLX4_DEV_CAP_64B_EQE_ENABLED = 1LL << 0, - MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1 + MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1, + MLX4_DEV_CAP_CQE_STRIDE_ENABLED = 1LL << 2, + MLX4_DEV_CAP_EQE_STRIDE_ENABLED = 1LL << 3 }; enum { - MLX4_USER_DEV_CAP_64B_CQE = 1L << 0 + MLX4_USER_DEV_CAP_LARGE_CQE = 1L << 0 }; enum { - MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0 + MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0, + MLX4_FUNC_CAP_EQE_CQE_STRIDE = 1L << 1 }; -- cgit v1.2.3 From 2446254915a7d6f08bba9a755a34cc0402880472 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 18 Sep 2014 17:31:22 -0700 Subject: net: dsa: allow switch drivers to implement suspend/resume hooks Add an abstraction layer to suspend/resume switch devices, doing the following split: - suspend/resume the slave network devices and their corresponding PHY devices - suspend/resume the switch hardware using switch driver callbacks Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++ net/dsa/dsa.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/dsa_priv.h | 2 ++ net/dsa/slave.c | 31 +++++++++++++++++++++ 4 files changed, 119 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index e020b8a12b7d..846dce4abaa5 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -210,6 +210,12 @@ struct dsa_switch_driver { void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds); + + /* + * Suspend and resume + */ + int (*suspend)(struct dsa_switch *ds); + int (*resume)(struct dsa_switch *ds); }; void register_switch_driver(struct dsa_switch_driver *type); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6e40928ec0e7..6905f2d84c44 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -238,6 +238,49 @@ static void dsa_switch_destroy(struct dsa_switch *ds) { } +static int dsa_switch_suspend(struct dsa_switch *ds) +{ + int i, ret = 0; + + /* Suspend slave network devices */ + for (i = 0; i < DSA_MAX_PORTS; i++) { + if (!(ds->phys_port_mask & (1 << i))) + continue; + + ret = dsa_slave_suspend(ds->ports[i]); + if (ret) + return ret; + } + + if (ds->drv->suspend) + ret = ds->drv->suspend(ds); + + return ret; +} + +static int dsa_switch_resume(struct dsa_switch *ds) +{ + int i, ret = 0; + + if (ds->drv->resume) + ret = ds->drv->resume(ds); + + if (ret) + return ret; + + /* Resume slave network devices */ + for (i = 0; i < DSA_MAX_PORTS; i++) { + if (!(ds->phys_port_mask & (1 << i))) + continue; + + ret = dsa_slave_resume(ds->ports[i]); + if (ret) + return ret; + } + + return 0; +} + /* link polling *************************************************************/ static void dsa_link_poll_work(struct work_struct *ugly) @@ -650,6 +693,42 @@ static struct packet_type dsa_pack_type __read_mostly = { .func = dsa_switch_rcv, }; +#ifdef CONFIG_PM_SLEEP +static int dsa_suspend(struct device *d) +{ + struct platform_device *pdev = to_platform_device(d); + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + int i, ret = 0; + + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds != NULL) + ret = dsa_switch_suspend(ds); + } + + return ret; +} + +static int dsa_resume(struct device *d) +{ + struct platform_device *pdev = to_platform_device(d); + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + int i, ret = 0; + + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds != NULL) + ret = dsa_switch_resume(ds); + } + + return ret; +} +#endif + +static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume); + static const struct of_device_id dsa_of_match_table[] = { { .compatible = "brcm,bcm7445-switch-v4.0" }, { .compatible = "marvell,dsa", }, @@ -665,6 +744,7 @@ static struct platform_driver dsa_driver = { .name = "dsa", .owner = THIS_MODULE, .of_match_table = dsa_of_match_table, + .pm = &dsa_pm_ops, }, }; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index f90899e8ab5a..dc9756d3154c 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -56,6 +56,8 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds); struct net_device *dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name); +int dsa_slave_suspend(struct net_device *slave_dev); +int dsa_slave_resume(struct net_device *slave_dev); /* tag_dsa.c */ extern const struct dsa_device_ops dsa_netdev_ops; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a7997265019a..143811ef57ae 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -412,6 +412,37 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, p->phy->addr, p->phy->drv->name); } +int dsa_slave_suspend(struct net_device *slave_dev) +{ + struct dsa_slave_priv *p = netdev_priv(slave_dev); + + netif_device_detach(slave_dev); + + if (p->phy) { + phy_stop(p->phy); + p->old_pause = -1; + p->old_link = -1; + p->old_duplex = -1; + phy_suspend(p->phy); + } + + return 0; +} + +int dsa_slave_resume(struct net_device *slave_dev) +{ + struct dsa_slave_priv *p = netdev_priv(slave_dev); + + netif_device_attach(slave_dev); + + if (p->phy) { + phy_resume(p->phy); + phy_start(p->phy); + } + + return 0; +} + struct net_device * dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name) -- cgit v1.2.3 From 19e57c4e6dc6b82a3204b801f4c5f27c7d007559 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 18 Sep 2014 17:31:24 -0700 Subject: net: dsa: add {get, set}_wol callbacks to slave devices Allow switch drivers to implement per-port Wake-on-LAN getter and setters. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 8 ++++++++ net/dsa/slave.c | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 846dce4abaa5..d8054fb4a4df 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -211,6 +211,14 @@ struct dsa_switch_driver { int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds); + /* + * ethtool Wake-on-LAN + */ + void (*get_wol)(struct dsa_switch *ds, int port, + struct ethtool_wolinfo *w); + int (*set_wol)(struct dsa_switch *ds, int port, + struct ethtool_wolinfo *w); + /* * Suspend and resume */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 143811ef57ae..43c1e4ade689 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -301,6 +301,27 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) return -EOPNOTSUPP; } +static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->drv->get_wol) + ds->drv->get_wol(ds, p->port, w); +} + +static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret = -EOPNOTSUPP; + + if (ds->drv->set_wol) + ret = ds->drv->set_wol(ds, p->port, w); + + return ret; +} + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_settings = dsa_slave_get_settings, .set_settings = dsa_slave_set_settings, @@ -310,6 +331,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_strings = dsa_slave_get_strings, .get_ethtool_stats = dsa_slave_get_ethtool_stats, .get_sset_count = dsa_slave_get_sset_count, + .set_wol = dsa_slave_set_wol, + .get_wol = dsa_slave_get_wol, }; static const struct net_device_ops dsa_slave_netdev_ops = { -- cgit v1.2.3 From 35f7aa5309c048bb70e58571942795fa9411ce6a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Sep 2014 14:03:55 +0200 Subject: ipv6: mld: answer mldv2 queries with mldv1 reports in mldv1 fallback RFC2710 (MLDv1), section 3.7. says: The length of a received MLD message is computed by taking the IPv6 Payload Length value and subtracting the length of any IPv6 extension headers present between the IPv6 header and the MLD message. If that length is greater than 24 octets, that indicates that there are other fields present *beyond* the fields described above, perhaps belonging to a *future backwards-compatible* version of MLD. An implementation of the version of MLD specified in this document *MUST NOT* send an MLD message longer than 24 octets and MUST ignore anything past the first 24 octets of a received MLD message. RFC3810 (MLDv2), section 8.2.1. states for *listeners* regarding presence of MLDv1 routers: In order to be compatible with MLDv1 routers, MLDv2 hosts MUST operate in version 1 compatibility mode. [...] When Host Compatibility Mode is MLDv2, a host acts using the MLDv2 protocol on that interface. When Host Compatibility Mode is MLDv1, a host acts in MLDv1 compatibility mode, using *only* the MLDv1 protocol, on that interface. [...] While section 8.3.1. specifies *router* behaviour regarding presence of MLDv1 routers: MLDv2 routers may be placed on a network where there is at least one MLDv1 router. The following requirements apply: If an MLDv1 router is present on the link, the Querier MUST use the *lowest* version of MLD present on the network. This must be administratively assured. Routers that desire to be compatible with MLDv1 MUST have a configuration option to act in MLDv1 mode; if an MLDv1 router is present on the link, the system administrator must explicitly configure all MLDv2 routers to act in MLDv1 mode. When in MLDv1 mode, the Querier MUST send periodic General Queries truncated at the Multicast Address field (i.e., 24 bytes long), and SHOULD also warn about receiving an MLDv2 Query (such warnings must be rate-limited). The Querier MUST also fill in the Maximum Response Delay in the Maximum Response Code field, i.e., the exponential algorithm described in section 5.1.3. is not used. [...] That means that we should not get queries from different versions of MLD. When there's a MLDv1 router present, MLDv2 enforces truncation and MRC == MRD (both fields are overlapping within the 24 octet range). Section 8.3.2. specifies behaviour in the presence of MLDv1 multicast address *listeners*: MLDv2 routers may be placed on a network where there are hosts that have not yet been upgraded to MLDv2. In order to be compatible with MLDv1 hosts, MLDv2 routers MUST operate in version 1 compatibility mode. MLDv2 routers keep a compatibility mode per multicast address record. The compatibility mode of a multicast address is determined from the Multicast Address Compatibility Mode variable, which can be in one of the two following states: MLDv1 or MLDv2. The Multicast Address Compatibility Mode of a multicast address record is set to MLDv1 whenever an MLDv1 Multicast Listener Report is *received* for that multicast address. At the same time, the Older Version Host Present timer for the multicast address is set to Older Version Host Present Timeout seconds. The timer is re-set whenever a new MLDv1 Report is received for that multicast address. If the Older Version Host Present timer expires, the router switches back to Multicast Address Compatibility Mode of MLDv2 for that multicast address. [...] That means, what can happen is the following scenario, that hosts can act in MLDv1 compatibility mode when they previously have received an MLDv1 query (or, simply operate in MLDv1 mode-only); and at the same time, an MLDv2 router could start up and transmits MLDv2 startup query messages while being unaware of the current operational mode. Given RFC2710, section 3.7 we would need to answer to that with an MLDv1 listener report, so that the router according to RFC3810, section 8.3.2. would receive that and internally switch to MLDv1 compatibility as well. Right now, I believe since the initial implementation of MLDv2, Linux hosts would just silently drop such MLDv2 queries instead of replying with an MLDv1 listener report, which would prevent a MLDv2 router going into fallback mode (until it receives other MLDv1 queries). Since the mapping of MRC to MRD in exactly such cases can make use of the exponential algorithm from 5.1.3, we cannot [strictly speaking] be aware in MLDv1 of the encoding in MRC, it seems also not mentioned by the RFC. Since encodings are the same up to 32767, assume in such a situation this value as a hard upper limit we would clamp. We have asked one of the RFC authors on that regard, and he mentioned that there seem not to be any implementations that make use of that exponential algorithm on startup messages. In any case, this patch fixes this MLD interoperability issue. Signed-off-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/mld.h | 5 ++++- net/ipv6/mcast.c | 41 +++++++++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/mld.h b/include/net/mld.h index faa1d161bf24..01d751303498 100644 --- a/include/net/mld.h +++ b/include/net/mld.h @@ -88,12 +88,15 @@ struct mld2_query { #define MLDV2_QQIC_EXP(value) (((value) >> 4) & 0x07) #define MLDV2_QQIC_MAN(value) ((value) & 0x0f) +#define MLD_EXP_MIN_LIMIT 32768UL +#define MLDV1_MRD_MAX_COMPAT (MLD_EXP_MIN_LIMIT - 1) + static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2) { /* RFC3810, 5.1.3. Maximum Response Code */ unsigned long ret, mc_mrc = ntohs(mlh2->mld2q_mrc); - if (mc_mrc < 32768) { + if (mc_mrc < MLD_EXP_MIN_LIMIT) { ret = mc_mrc; } else { unsigned long mc_man, mc_exp; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 592eba61e78a..9648de2b6745 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1237,7 +1237,7 @@ static void mld_update_qri(struct inet6_dev *idev, } static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, - unsigned long *max_delay) + unsigned long *max_delay, bool v1_query) { unsigned long mldv1_md; @@ -1245,11 +1245,32 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, if (mld_in_v2_mode_only(idev)) return -EINVAL; - /* MLDv1 router present */ mldv1_md = ntohs(mld->mld_maxdelay); + + /* When in MLDv1 fallback and a MLDv2 router start-up being + * unaware of current MLDv1 operation, the MRC == MRD mapping + * only works when the exponential algorithm is not being + * used (as MLDv1 is unaware of such things). + * + * According to the RFC author, the MLDv2 implementations + * he's aware of all use a MRC < 32768 on start up queries. + * + * Thus, should we *ever* encounter something else larger + * than that, just assume the maximum possible within our + * reach. + */ + if (!v1_query) + mldv1_md = min(mldv1_md, MLDV1_MRD_MAX_COMPAT); + *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL); - mld_set_v1_mode(idev); + /* MLDv1 router present: we need to go into v1 mode *only* + * when an MLDv1 query is received as per section 9.12. of + * RFC3810! And we know from RFC2710 section 3.7 that MLDv1 + * queries MUST be of exactly 24 octets. + */ + if (v1_query) + mld_set_v1_mode(idev); /* cancel MLDv2 report timer */ mld_gq_stop_timer(idev); @@ -1264,10 +1285,6 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, unsigned long *max_delay) { - /* hosts need to stay in MLDv1 mode, discard MLDv2 queries */ - if (mld_in_v1_mode(idev)) - return -EINVAL; - *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); mld_update_qrv(idev, mld); @@ -1324,8 +1341,11 @@ int igmp6_event_query(struct sk_buff *skb) !(group_type&IPV6_ADDR_MULTICAST)) return -EINVAL; - if (len == MLD_V1_QUERY_LEN) { - err = mld_process_v1(idev, mld, &max_delay); + if (len < MLD_V1_QUERY_LEN) { + return -EINVAL; + } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) { + err = mld_process_v1(idev, mld, &max_delay, + len == MLD_V1_QUERY_LEN); if (err < 0) return err; } else if (len >= MLD_V2_QUERY_LEN_MIN) { @@ -1357,8 +1377,9 @@ int igmp6_event_query(struct sk_buff *skb) mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } - } else + } else { return -EINVAL; + } read_lock_bh(&idev->lock); if (group_type == IPV6_ADDR_ANY) { -- cgit v1.2.3 From fcdd1cf4dd63aecf86c987d7f4ec7187be5c2fbc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Sep 2014 13:19:44 -0700 Subject: tcp: avoid possible arithmetic overflows icsk_rto is a 32bit field, and icsk_backoff can reach 15 by default, or more if some sysctl (eg tcp_retries2) are changed. Better use 64bit to perform icsk_rto << icsk_backoff operations As Joe Perches suggested, add a helper for this. Yuchung spotted the tcp_v4_err() case. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 9 +++++++++ net/ipv4/tcp_input.c | 5 +++-- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv4/tcp_output.c | 13 ++++++------- net/ipv4/tcp_timer.c | 4 ++-- 5 files changed, 23 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 5fbe6568c3cf..848e85cb5c61 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -242,6 +242,15 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, #endif } +static inline unsigned long +inet_csk_rto_backoff(const struct inet_connection_sock *icsk, + unsigned long max_when) +{ + u64 when = (u64)icsk->icsk_rto << icsk->icsk_backoff; + + return (unsigned long)min_t(u64, when, max_when); +} + struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); struct request_sock *inet_csk_search_req(const struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 02fb66d4a018..13f3da4762e3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3208,9 +3208,10 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { + unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + when, TCP_RTO_MAX); } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 006b045716d8..3b2e49cb2b61 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -430,9 +430,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) break; icsk->icsk_backoff--; - inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : - TCP_TIMEOUT_INIT) << icsk->icsk_backoff; - tcp_bound_rto(sk); + icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : + TCP_TIMEOUT_INIT; + icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); skb = tcp_write_queue_head(sk); BUG_ON(!skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7f1280dcad57..8c61a7c0c889 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3279,6 +3279,7 @@ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + unsigned long probe_max; int err; err = tcp_write_wakeup(sk); @@ -3294,9 +3295,7 @@ void tcp_send_probe0(struct sock *sk) if (icsk->icsk_backoff < sysctl_tcp_retries2) icsk->icsk_backoff++; icsk->icsk_probes_out++; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + probe_max = TCP_RTO_MAX; } else { /* If packet was not sent due to local congestion, * do not backoff and do not remember icsk_probes_out. @@ -3306,11 +3305,11 @@ void tcp_send_probe0(struct sock *sk) */ if (!icsk->icsk_probes_out) icsk->icsk_probes_out = 1; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, - TCP_RESOURCE_PROBE_INTERVAL), - TCP_RTO_MAX); + probe_max = TCP_RESOURCE_PROBE_INTERVAL; } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + inet_csk_rto_backoff(icsk, probe_max), + TCP_RTO_MAX); } int tcp_rtx_synack(struct sock *sk, struct request_sock *req) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a339e7ba05a4..b24360f6e293 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -180,7 +180,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = (icsk->icsk_rto < TCP_RTO_MAX); + const int alive = icsk->icsk_rto < TCP_RTO_MAX; retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -294,7 +294,7 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); + const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); -- cgit v1.2.3 From 4cdf507d54525842dfd9f6313fdafba039084046 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2014 07:38:40 -0700 Subject: icmp: add a global rate limitation Current ICMP rate limiting uses inetpeer cache, which is an RBL tree protected by a lock, meaning that hosts can be stuck hard if all cpus want to check ICMP limits. When say a DNS or NTP server process is restarted, inetpeer tree grows quick and machine comes to its knees. iptables can not help because the bottleneck happens before ICMP messages are even cooked and sent. This patch adds a new global limitation, using a token bucket filter, controlled by two new sysctl : icmp_msgs_per_sec - INTEGER Limit maximal number of ICMP packets sent per second from this host. Only messages whose type matches icmp_ratemask are controlled by this limit. Default: 1000 icmp_msgs_burst - INTEGER icmp_msgs_per_sec controls number of ICMP packets sent per second, while icmp_msgs_burst controls the burst size of these packets. Default: 50 Note that if we really want to send millions of ICMP messages per second, we might extend idea and infra added in commit 04ca6973f7c1a ("ip: make IP identifiers less predictable") : add a token bucket in the ip_idents hash and no longer rely on inetpeer. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 13 +++++++ include/net/ip.h | 4 +++ net/ipv4/icmp.c | 64 +++++++++++++++++++++++++++++++--- net/ipv4/sysctl_net_ipv4.c | 16 +++++++++ net/ipv6/icmp.c | 20 ++++++----- 5 files changed, 105 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 1b5581a30d77..c7a81ace35d0 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -769,8 +769,21 @@ icmp_ratelimit - INTEGER icmp_ratemask (see below) to specific targets. 0 to disable any limiting, otherwise the minimal space between responses in milliseconds. + Note that another sysctl, icmp_msgs_per_sec limits the number + of ICMP packets sent on all targets. Default: 1000 +icmp_msgs_per_sec - INTEGER + Limit maximal number of ICMP packets sent per second from this host. + Only messages whose type matches icmp_ratemask (see below) are + controlled by this limit. + Default: 1000 + +icmp_msgs_burst - INTEGER + icmp_msgs_per_sec controls number of ICMP packets sent per second, + while icmp_msgs_burst controls the burst size of these packets. + Default: 50 + icmp_ratemask - INTEGER Mask made of ICMP types for which rates are being limited. Significant bits: IHGFEDCBA9876543210 diff --git a/include/net/ip.h b/include/net/ip.h index 14bfc8e1bcf9..fcd9068fb8c3 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -548,6 +548,10 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, u32 info); +bool icmp_global_allow(void); +extern int sysctl_icmp_msgs_per_sec; +extern int sysctl_icmp_msgs_burst; + #ifdef CONFIG_PROC_FS int ip_misc_proc_init(void); #endif diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index ea7d4afe8205..5882f584910e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -231,12 +231,62 @@ static inline void icmp_xmit_unlock(struct sock *sk) spin_unlock_bh(&sk->sk_lock.slock); } +int sysctl_icmp_msgs_per_sec __read_mostly = 1000; +int sysctl_icmp_msgs_burst __read_mostly = 50; + +static struct { + spinlock_t lock; + u32 credit; + u32 stamp; +} icmp_global = { + .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock), +}; + +/** + * icmp_global_allow - Are we allowed to send one more ICMP message ? + * + * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec. + * Returns false if we reached the limit and can not send another packet. + * Note: called with BH disabled + */ +bool icmp_global_allow(void) +{ + u32 credit, delta, incr = 0, now = (u32)jiffies; + bool rc = false; + + /* Check if token bucket is empty and cannot be refilled + * without taking the spinlock. + */ + if (!icmp_global.credit) { + delta = min_t(u32, now - icmp_global.stamp, HZ); + if (delta < HZ / 50) + return false; + } + + spin_lock(&icmp_global.lock); + delta = min_t(u32, now - icmp_global.stamp, HZ); + if (delta >= HZ / 50) { + incr = sysctl_icmp_msgs_per_sec * delta / HZ ; + if (incr) + icmp_global.stamp = now; + } + credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); + if (credit) { + credit--; + rc = true; + } + icmp_global.credit = credit; + spin_unlock(&icmp_global.lock); + return rc; +} +EXPORT_SYMBOL(icmp_global_allow); + /* * Send an ICMP frame. */ -static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, - struct flowi4 *fl4, int type, int code) +static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, + struct flowi4 *fl4, int type, int code) { struct dst_entry *dst = &rt->dst; bool rc = true; @@ -253,8 +303,14 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, goto out; /* Limit if icmp type is enabled in ratemask. */ - if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { - struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); + if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) + goto out; + + rc = false; + if (icmp_global_allow()) { + struct inet_peer *peer; + + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); if (peer) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1599966f4639..8a25509c35b3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -730,6 +730,22 @@ static struct ctl_table ipv4_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "icmp_msgs_per_sec", + .data = &sysctl_icmp_msgs_per_sec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "icmp_msgs_burst", + .data = &sysctl_icmp_msgs_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, { .procname = "udp_mem", .data = &sysctl_udp_mem, diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 394bb824fe4b..141e1f3ab74e 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -170,11 +170,11 @@ static bool is_ineligible(const struct sk_buff *skb) /* * Check the ICMP output rate limit */ -static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, - struct flowi6 *fl6) +static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, + struct flowi6 *fl6) { - struct dst_entry *dst; struct net *net = sock_net(sk); + struct dst_entry *dst; bool res = false; /* Informational messages are not limited. */ @@ -199,16 +199,20 @@ static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else { struct rt6_info *rt = (struct rt6_info *)dst; int tmo = net->ipv6.sysctl.icmpv6_time; - struct inet_peer *peer; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); - res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); + if (icmp_global_allow()) { + struct inet_peer *peer; + + peer = inet_getpeer_v6(net->ipv6.peers, + &rt->rt6i_dst.addr, 1); + res = inet_peer_xrlim_allow(peer, tmo); + if (peer) + inet_putpeer(peer); + } } dst_release(dst); return res; -- cgit v1.2.3 From 9e87f9a9c4c4754508b2c2638fbde9e10c7a103b Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sat, 13 Sep 2014 10:28:49 +0200 Subject: NFC: nci: Add support for proprietary RF Protocols In NFC Forum NCI specification, some RF Protocol values are reserved for proprietary use (from 0x80 to 0xfe). Some CLF vendor may need to use one value within this range for specific technology. Furthermore, some CLF may not becompliant with NFC Froum NCI specification 2.0 and therefore will not support RF Protocol value 0x06 for PROTOCOL_T5T as mention in a draft specification and in a recent push. Adding get_rf_protocol handle to the nci_ops structure will help to set the correct technology to target. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- drivers/nfc/st21nfcb/st21nfcb.c | 10 ++++++++++ include/net/nfc/nci_core.h | 9 +++++---- net/nfc/nci/ntf.c | 9 ++++++++- 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/nfc/st21nfcb/st21nfcb.c b/drivers/nfc/st21nfcb/st21nfcb.c index 90d37156169e..e72dae816e72 100644 --- a/drivers/nfc/st21nfcb/st21nfcb.c +++ b/drivers/nfc/st21nfcb/st21nfcb.c @@ -25,6 +25,8 @@ #define DRIVER_DESC "NCI NFC driver for ST21NFCB" +#define ST21NFCB_NCI1_X_PROPRIETARY_ISO15693 0x83 + static int st21nfcb_nci_open(struct nci_dev *ndev) { struct st21nfcb_nci_info *info = nci_get_drvdata(ndev); @@ -64,10 +66,18 @@ static int st21nfcb_nci_send(struct nci_dev *ndev, struct sk_buff *skb) return ndlc_send(info->ndlc, skb); } +static __u32 st21nfcb_nci_get_rfprotocol(struct nci_dev *ndev, + __u8 rf_protocol) +{ + return rf_protocol == ST21NFCB_NCI1_X_PROPRIETARY_ISO15693 ? + NFC_PROTO_ISO15693_MASK : 0; +} + static struct nci_ops st21nfcb_nci_ops = { .open = st21nfcb_nci_open, .close = st21nfcb_nci_close, .send = st21nfcb_nci_send, + .get_rfprotocol = st21nfcb_nci_get_rfprotocol, }; int st21nfcb_nci_probe(struct llt_ndlc *ndlc, int phy_headroom, diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index 1f9a0f5272fe..75d10e625c49 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -64,10 +64,11 @@ enum nci_state { struct nci_dev; struct nci_ops { - int (*open)(struct nci_dev *ndev); - int (*close)(struct nci_dev *ndev); - int (*send)(struct nci_dev *ndev, struct sk_buff *skb); - int (*setup)(struct nci_dev *ndev); + int (*open)(struct nci_dev *ndev); + int (*close)(struct nci_dev *ndev); + int (*send)(struct nci_dev *ndev, struct sk_buff *skb); + int (*setup)(struct nci_dev *ndev); + __u32 (*get_rfprotocol)(struct nci_dev *ndev, __u8 rf_protocol); }; #define NCI_MAX_SUPPORTED_RF_INTERFACES 4 diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 25e44cebd60a..205b35f666db 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -167,6 +167,13 @@ static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, return data; } +__u32 nci_get_prop_rf_protocol(struct nci_dev *ndev, __u8 rf_protocol) +{ + if (ndev->ops->get_rfprotocol) + return ndev->ops->get_rfprotocol(ndev, rf_protocol); + return 0; +} + static int nci_add_new_protocol(struct nci_dev *ndev, struct nfc_target *target, __u8 rf_protocol, @@ -195,7 +202,7 @@ static int nci_add_new_protocol(struct nci_dev *ndev, else if (rf_protocol == NCI_RF_PROTOCOL_T5T) protocol = NFC_PROTO_ISO15693_MASK; else - protocol = 0; + protocol = nci_get_prop_rf_protocol(ndev, rf_protocol); if (!(protocol & ndev->poll_prots)) { pr_err("the target found does not have the desired protocol\n"); -- cgit v1.2.3 From 2b0bf6c85a4940e00516f68ff7103329abf8512d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 22 Sep 2014 11:17:41 -0700 Subject: Bluetooth: Convert bt_ logging functions to return void No caller or macro uses the return value so make all the functions return void. Signed-off-by: Joe Perches Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 4 ++-- net/bluetooth/lib.c | 14 ++++---------- 2 files changed, 6 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 7e666d06b97f..58695ffeb138 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -120,9 +120,9 @@ struct bt_voice { #define BT_RCVMTU 13 __printf(1, 2) -int bt_info(const char *fmt, ...); +void bt_info(const char *fmt, ...); __printf(1, 2) -int bt_err(const char *fmt, ...); +void bt_err(const char *fmt, ...); #define BT_INFO(fmt, ...) bt_info(fmt "\n", ##__VA_ARGS__) #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 941ad7530eda..b36bc0415854 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -135,40 +135,34 @@ int bt_to_errno(__u16 code) } EXPORT_SYMBOL(bt_to_errno); -int bt_info(const char *format, ...) +void bt_info(const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = pr_info("%pV", &vaf); + pr_info("%pV", &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(bt_info); -int bt_err(const char *format, ...) +void bt_err(const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = pr_err("%pV", &vaf); + pr_err("%pV", &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(bt_err); -- cgit v1.2.3 From d41c15cf95bd91b9c333f6f749670e22c8a47ad9 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 24 Sep 2014 13:14:46 +0300 Subject: Bluetooth: Fix reason code used for rejecting SCO connections The core specification defines valid values for the HCI_Reject_Synchronous_Connection_Request command to be 0x0D-0x0F. So far the code has been using HCI_ERROR_REMOTE_USER_TERM (0x13) which is not a valid value and is therefore being rejected by some controllers: > HCI Event: Connect Request (0x04) plen 10 bdaddr 40:6F:2A:6A:E5:E0 class 0x000000 type eSCO < HCI Command: Reject Synchronous Connection (0x01|0x002a) plen 7 bdaddr 40:6F:2A:6A:E5:E0 reason 0x13 Reason: Remote User Terminated Connection > HCI Event: Command Status (0x0f) plen 4 Reject Synchronous Connection (0x01|0x002a) status 0x12 ncmd 1 Error: Invalid HCI Command Parameters This patch introduces a new define for a value from the valid range (0x0d == Connection Rejected Due To Limited Resources) and uses it instead for rejecting incoming connections. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_conn.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 3f8547f1c6f8..6e8f24967308 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -385,6 +385,7 @@ enum { #define HCI_ERROR_AUTH_FAILURE 0x05 #define HCI_ERROR_MEMORY_EXCEEDED 0x07 #define HCI_ERROR_CONNECTION_TIMEOUT 0x08 +#define HCI_ERROR_REJ_LIMITED_RESOURCES 0x0d #define HCI_ERROR_REJ_BAD_ADDR 0x0f #define HCI_ERROR_REMOTE_USER_TERM 0x13 #define HCI_ERROR_REMOTE_LOW_RESOURCES 0x14 diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 22b253750f78..445829cd363c 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -121,7 +121,7 @@ static void hci_reject_sco(struct hci_conn *conn) { struct hci_cp_reject_sync_conn_req cp; - cp.reason = HCI_ERROR_REMOTE_USER_TERM; + cp.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; bacpy(&cp.bdaddr, &conn->dst); hci_send_cmd(conn->hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp); -- cgit v1.2.3 From 53e50398968d43338c4d932114e68bc099fc5fbd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 20 Sep 2014 14:52:30 -0700 Subject: net: Remove gso_send_check as an offload callback The send_check logic was only interesting in cases of TCP offload and UDP UFO where the checksum needed to be initialized to the pseudo header checksum. Now we've moved that logic into the related gso_segment functions so gso_send_check is no longer needed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - net/core/dev.c | 10 ---------- net/ipv4/af_inet.c | 36 ------------------------------------ net/ipv4/gre_offload.c | 11 +++-------- net/ipv4/tcp_offload.c | 6 ------ net/ipv4/udp_offload.c | 6 ------ net/ipv6/ip6_offload.c | 27 --------------------------- net/ipv6/tcpv6_offload.c | 6 ------ net/ipv6/udp_offload.c | 6 ------ net/mpls/mpls_gso.c | 7 ------- 10 files changed, 3 insertions(+), 113 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4354b4307e37..9f5d293a0281 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1911,7 +1911,6 @@ struct packet_type { struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); - int (*gso_send_check)(struct sk_buff *skb); struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); diff --git a/net/core/dev.c b/net/core/dev.c index db0388607329..e2ced01c01e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2422,16 +2422,6 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - int err; - - err = ptype->callbacks.gso_send_check(skb); - segs = ERR_PTR(err); - if (err || skb_gso_ok(skb, features)) - break; - __skb_push(skb, (skb->data - - skb_network_header(skb))); - } segs = ptype->callbacks.gso_segment(skb, features); break; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 72011cc4c13b..28e589c5f32d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1197,40 +1197,6 @@ int inet_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL(inet_sk_rebuild_header); -static int inet_gso_send_check(struct sk_buff *skb) -{ - const struct net_offload *ops; - const struct iphdr *iph; - int proto; - int ihl; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) - goto out; - - iph = ip_hdr(skb); - ihl = iph->ihl * 4; - if (ihl < sizeof(*iph)) - goto out; - - proto = iph->protocol; - - /* Warning: after this point, iph might be no longer valid */ - if (unlikely(!pskb_may_pull(skb, ihl))) - goto out; - __skb_pull(skb, ihl); - - skb_reset_transport_header(skb); - err = -EPROTONOSUPPORT; - - ops = rcu_dereference(inet_offloads[proto]); - if (likely(ops && ops->callbacks.gso_send_check)) - err = ops->callbacks.gso_send_check(skb); - -out: - return err; -} - static struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -1655,7 +1621,6 @@ static int ipv4_proc_init(void); static struct packet_offload ip_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .callbacks = { - .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, @@ -1664,7 +1629,6 @@ static struct packet_offload ip_packet_offload __read_mostly = { static const struct net_offload ipip_offload = { .callbacks = { - .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index d3fe2ac05167..a77729503071 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -15,13 +15,6 @@ #include #include -static int gre_gso_send_check(struct sk_buff *skb) -{ - if (!skb->encapsulation) - return -EINVAL; - return 0; -} - static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -46,6 +39,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_IPIP))) goto out; + if (!skb->encapsulation) + goto out; + if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) goto out; @@ -256,7 +252,6 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload gre_offload = { .callbacks = { - .gso_send_check = gre_gso_send_check, .gso_segment = gre_gso_segment, .gro_receive = gre_gro_receive, .gro_complete = gre_gro_complete, diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 7cd12b0458ff..5b90f2f447a5 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -288,11 +288,6 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static int tcp_v4_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ @@ -320,7 +315,6 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff) static const struct net_offload tcpv4_offload = { .callbacks = { - .gso_send_check = tcp_v4_gso_send_check, .gso_segment = tcp4_gso_segment, .gro_receive = tcp4_gro_receive, .gro_complete = tcp4_gro_complete, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2918cc914824..19ebe6a39ddc 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -25,11 +25,6 @@ struct udp_offload_priv { struct udp_offload_priv __rcu *next; }; -static int udp4_ufo_send_check(struct sk_buff *skb) -{ - return 0; -} - struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features) { @@ -346,7 +341,6 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv4_offload = { .callbacks = { - .gso_send_check = udp4_ufo_send_check, .gso_segment = udp4_ufo_fragment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 9952f3fce30a..9034f76ae013 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -53,31 +53,6 @@ static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) return proto; } -static int ipv6_gso_send_check(struct sk_buff *skb) -{ - const struct ipv6hdr *ipv6h; - const struct net_offload *ops; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - goto out; - - ipv6h = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*ipv6h)); - err = -EPROTONOSUPPORT; - - ops = rcu_dereference(inet6_offloads[ - ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr)]); - - if (likely(ops && ops->callbacks.gso_send_check)) { - skb_reset_transport_header(skb); - err = ops->callbacks.gso_send_check(skb); - } - -out: - return err; -} - static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -306,7 +281,6 @@ out_unlock: static struct packet_offload ipv6_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .callbacks = { - .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, @@ -315,7 +289,6 @@ static struct packet_offload ipv6_packet_offload __read_mostly = { static const struct net_offload sit_offload = { .callbacks = { - .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 96253154db3a..c1ab77105b4c 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -15,11 +15,6 @@ #include #include "ip6_offload.h" -static int tcp_v6_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -71,7 +66,6 @@ struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, } static const struct net_offload tcpv6_offload = { .callbacks = { - .gso_send_check = tcp_v6_gso_send_check, .gso_segment = tcp6_gso_segment, .gro_receive = tcp6_gro_receive, .gro_complete = tcp6_gro_complete, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index e4af6437ea3b..212ebfc7973f 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -17,11 +17,6 @@ #include #include "ip6_offload.h" -static int udp6_ufo_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { @@ -166,7 +161,6 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv6_offload = { .callbacks = { - .gso_send_check = udp6_ufo_send_check, .gso_segment = udp6_ufo_fragment, .gro_receive = udp6_gro_receive, .gro_complete = udp6_gro_complete, diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 6b38d083e1c9..e28ed2ef5b06 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -65,15 +65,9 @@ out: return segs; } -static int mpls_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct packet_offload mpls_mc_offload = { .type = cpu_to_be16(ETH_P_MPLS_MC), .callbacks = { - .gso_send_check = mpls_gso_send_check, .gso_segment = mpls_gso_segment, }, }; @@ -81,7 +75,6 @@ static struct packet_offload mpls_mc_offload = { static struct packet_offload mpls_uc_offload = { .type = cpu_to_be16(ETH_P_MPLS_UC), .callbacks = { - .gso_send_check = mpls_gso_send_check, .gso_segment = mpls_gso_segment, }, }; -- cgit v1.2.3 From 7276ca3fa23864133f5ee7431c51546d9b7f695f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 Sep 2014 13:28:16 +0200 Subject: netfilter: bridge: nf_bridge_copy_header as static inline in header Move nf_bridge_copy_header() as static inline in netfilter_bridge.h header file. This patch prepares the modularization of the br_netfilter code. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 48 +++++++++++++++++++++++++++++++--------- net/bridge/br_netfilter.c | 28 ----------------------- 2 files changed, 38 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 8ab1c278b66d..fe996d59de64 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -24,16 +24,6 @@ enum nf_br_hook_priorities { #define BRNF_8021Q 0x10 #define BRNF_PPPoE 0x20 -/* Only used in br_forward.c */ -int nf_bridge_copy_header(struct sk_buff *skb); -static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb) -{ - if (skb->nf_bridge && - skb->nf_bridge->mask & (BRNF_BRIDGED | BRNF_BRIDGED_DNAT)) - return nf_bridge_copy_header(skb); - return 0; -} - static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) { switch (skb->protocol) { @@ -46,6 +36,44 @@ static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) } } +static inline void nf_bridge_update_protocol(struct sk_buff *skb) +{ + if (skb->nf_bridge->mask & BRNF_8021Q) + skb->protocol = htons(ETH_P_8021Q); + else if (skb->nf_bridge->mask & BRNF_PPPoE) + skb->protocol = htons(ETH_P_PPP_SES); +} + +/* Fill in the header for fragmented IP packets handled by + * the IPv4 connection tracking code. + * + * Only used in br_forward.c + */ +static inline int nf_bridge_copy_header(struct sk_buff *skb) +{ + int err; + unsigned int header_size; + + nf_bridge_update_protocol(skb); + header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); + err = skb_cow_head(skb, header_size); + if (err) + return err; + + skb_copy_to_linear_data_offset(skb, -header_size, + skb->nf_bridge->data, header_size); + __skb_push(skb, nf_bridge_encap_header_len(skb)); + return 0; +} + +static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb) +{ + if (skb->nf_bridge && + skb->nf_bridge->mask & (BRNF_BRIDGED | BRNF_BRIDGED_DNAT)) + return nf_bridge_copy_header(skb); + return 0; +} + static inline unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { if (unlikely(skb->nf_bridge->mask & BRNF_PPPoE)) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index a615264cf01a..61929a7cd815 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -245,14 +245,6 @@ static inline void nf_bridge_save_header(struct sk_buff *skb) skb->nf_bridge->data, header_size); } -static inline void nf_bridge_update_protocol(struct sk_buff *skb) -{ - if (skb->nf_bridge->mask & BRNF_8021Q) - skb->protocol = htons(ETH_P_8021Q); - else if (skb->nf_bridge->mask & BRNF_PPPoE) - skb->protocol = htons(ETH_P_PPP_SES); -} - /* When handing a packet over to the IP layer * check whether we have a skb that is in the * expected format @@ -320,26 +312,6 @@ drop: return -1; } -/* Fill in the header for fragmented IP packets handled by - * the IPv4 connection tracking code. - */ -int nf_bridge_copy_header(struct sk_buff *skb) -{ - int err; - unsigned int header_size; - - nf_bridge_update_protocol(skb); - header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); - err = skb_cow_head(skb, header_size); - if (err) - return err; - - skb_copy_to_linear_data_offset(skb, -header_size, - skb->nf_bridge->data, header_size); - __skb_push(skb, nf_bridge_encap_header_len(skb)); - return 0; -} - /* PF_BRIDGE/PRE_ROUTING *********************************************/ /* Undo the changes made for ip6tables PREROUTING and continue the * bridge PRE_ROUTING hook. */ -- cgit v1.2.3 From 34666d467cbf1e2e3c7bb15a63eccfb582cdd71f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 18 Sep 2014 11:29:03 +0200 Subject: netfilter: bridge: move br_netfilter out of the core Jesper reported that br_netfilter always registers the hooks since this is part of the bridge core. This harms performance for people that don't need this. This patch modularizes br_netfilter so it can be rmmod'ed, thus, the hooks can be unregistered. I think the bridge netfilter should have been a separated module since the beginning, Patrick agreed on that. Note that this is breaking compatibility for users that expect that bridge netfilter is going to be available after explicitly 'modprobe bridge' or via automatic load through brctl. However, the damage can be easily undone by modprobing br_netfilter. The bridge core also spots a message to provide a clue to people that didn't notice that this has been deprecated. On top of that, the plan is that nftables will not rely on this software layer, but integrate the connection tracking into the bridge layer to enable stateful filtering and NAT, which is was bridge netfilter users seem to require. This patch still keeps the fake_dst_ops in the bridge core, since this is required by when the bridge port is initialized. So we can safely modprobe/rmmod br_netfilter anytime. Signed-off-by: Pablo Neira Ayuso Acked-by: Florian Westphal --- include/linux/netfilter_bridge.h | 2 +- include/linux/skbuff.h | 12 ++--- include/net/neighbour.h | 2 +- include/net/netfilter/ipv4/nf_reject.h | 2 +- include/net/netfilter/ipv6/nf_reject.h | 2 +- net/Kconfig | 7 +-- net/bridge/Makefile | 5 +- net/bridge/br.c | 14 ++--- net/bridge/br_device.c | 4 +- net/bridge/br_forward.c | 2 + net/bridge/br_input.c | 1 + net/bridge/br_netfilter.c | 88 ++++++------------------------- net/bridge/br_netlink.c | 2 +- net/bridge/br_nf_core.c | 96 ++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 12 ++--- net/bridge/br_sysfs_br.c | 4 +- 16 files changed, 151 insertions(+), 104 deletions(-) create mode 100644 net/bridge/br_nf_core.c (limited to 'include') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index fe996d59de64..c755e4971fa3 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -15,7 +15,7 @@ enum nf_br_hook_priorities { NF_BR_PRI_LAST = INT_MAX, }; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #define BRNF_PKT_TYPE 0x01 #define BRNF_BRIDGED_DNAT 0x02 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 07c9fdd0c126..c4ff43f84573 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -156,7 +156,7 @@ struct nf_conntrack { }; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { atomic_t use; unsigned int mask; @@ -560,7 +560,7 @@ struct sk_buff { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack *nfct; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info *nf_bridge; #endif @@ -2977,7 +2977,7 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) atomic_inc(&nfct->use); } #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) { if (nf_bridge && atomic_dec_and_test(&nf_bridge->use)) @@ -2995,7 +2995,7 @@ static inline void nf_reset(struct sk_buff *skb) nf_conntrack_put(skb->nfct); skb->nfct = NULL; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); skb->nf_bridge = NULL; #endif @@ -3016,7 +3016,7 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) nf_conntrack_get(src->nfct); dst->nfctinfo = src->nfctinfo; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dst->nf_bridge = src->nf_bridge; nf_bridge_get(src->nf_bridge); #endif @@ -3030,7 +3030,7 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(dst->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(dst->nf_bridge); #endif __nf_copy(dst, src); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 47f425464f84..f60558d0254c 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -373,7 +373,7 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) return 0; } -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index 931fbf812171..f713b5a31d62 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -98,7 +98,7 @@ static void nf_send_reset(struct sk_buff *oldskb, int hook) nf_ct_attach(nskb, oldskb); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip_local_out for bridged traffic, the MAC source on * the RST will be ours, instead of the destination's. This confuses * some routers/firewalls, and they drop the packet. So we need to diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index 710d17ed70b4..7a10cfcd8e33 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -147,7 +147,7 @@ static void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) nf_ct_attach(nskb, oldskb); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip6_local_out for bridged traffic, the MAC source on * the RST will be ours, instead of the destination's. This confuses * some routers/firewalls, and they drop the packet. So we need to diff --git a/net/Kconfig b/net/Kconfig index 4051fdfa4367..dc5d700d05e7 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -176,10 +176,11 @@ config NETFILTER_ADVANCED If unsure, say Y. config BRIDGE_NETFILTER - bool "Bridged IP/ARP packets filtering" - depends on BRIDGE && NETFILTER && INET + tristate "Bridged IP/ARP packets filtering" + depends on (BRIDGE || BRIDGE=n) + depends on NETFILTER && INET depends on NETFILTER_ADVANCED - default y + default m ---help--- Enabling this option will let arptables resp. iptables see bridged ARP resp. IP traffic. If you want a bridging firewall, you probably diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 8590b942bffa..5e3eac5dc8b9 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -6,11 +6,12 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_stp.o br_stp_bpdu.o \ - br_stp_if.o br_stp_timer.o br_netlink.o + br_stp_if.o br_stp_timer.o br_netlink.o \ + br_nf_core.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o -bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o +obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o diff --git a/net/bridge/br.c b/net/bridge/br.c index 1a755a1e5410..44425aff7cba 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -161,7 +161,7 @@ static int __init br_init(void) if (err) goto err_out1; - err = br_netfilter_init(); + err = br_nf_core_init(); if (err) goto err_out2; @@ -179,11 +179,16 @@ static int __init br_init(void) br_fdb_test_addr_hook = br_fdb_test_addr; #endif + pr_info("bridge: automatic filtering via arp/ip/ip6tables has been " + "deprecated. Update your scripts to load br_netfilter if you " + "need this.\n"); + return 0; + err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: - br_netfilter_fini(); + br_nf_core_fini(); err_out2: unregister_pernet_subsys(&br_net_ops); err_out1: @@ -196,20 +201,17 @@ err_out: static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); - br_netlink_fini(); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); - unregister_pernet_subsys(&br_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ - br_netfilter_fini(); + br_nf_core_fini(); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = NULL; #endif - br_fdb_fini(); } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 568cccd39a3d..659cac15c0df 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -36,7 +36,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) u16 vid = 0; rcu_read_lock(); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { br_nf_pre_routing_finish_bridge_slow(skb); rcu_read_unlock(); @@ -167,7 +167,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* remember the MTU in the rtable for PMTU */ dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu); #endif diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 056b67b0e277..992ec49a96aa 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -49,6 +49,7 @@ int br_dev_queue_push_xmit(struct sk_buff *skb) return 0; } +EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct sk_buff *skb) { @@ -56,6 +57,7 @@ int br_forward_finish(struct sk_buff *skb) br_dev_queue_push_xmit); } +EXPORT_SYMBOL_GPL(br_forward_finish); static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 366c43649079..6fd5522df696 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -140,6 +140,7 @@ drop: kfree_skb(skb); goto out; } +EXPORT_SYMBOL_GPL(br_handle_frame_finish); /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct sk_buff *skb) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 61929a7cd815..97e43937aaca 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -111,66 +111,6 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) pppoe_proto(skb) == htons(PPP_IPV6) && \ brnf_filter_pppoe_tagged) -static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) -{ -} - -static void fake_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb) -{ -} - -static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) -{ - return NULL; -} - -static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) -{ - return NULL; -} - -static unsigned int fake_mtu(const struct dst_entry *dst) -{ - return dst->dev->mtu; -} - -static struct dst_ops fake_dst_ops = { - .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), - .update_pmtu = fake_update_pmtu, - .redirect = fake_redirect, - .cow_metrics = fake_cow_metrics, - .neigh_lookup = fake_neigh_lookup, - .mtu = fake_mtu, -}; - -/* - * Initialize bogus route table used to keep netfilter happy. - * Currently, we fill in the PMTU entry because netfilter - * refragmentation needs it, and the rt_flags entry because - * ipt_REJECT needs it. Future netfilter modules might - * require us to fill additional fields. - */ -static const u32 br_dst_default_metrics[RTAX_MAX] = { - [RTAX_MTU - 1] = 1500, -}; - -void br_netfilter_rtable_init(struct net_bridge *br) -{ - struct rtable *rt = &br->fake_rtable; - - atomic_set(&rt->dst.__refcnt, 1); - rt->dst.dev = br->dev; - rt->dst.path = &rt->dst; - dst_init_metrics(&rt->dst, br_dst_default_metrics, true); - rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; - rt->dst.ops = &fake_dst_ops; -} - static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) { struct net_bridge_port *port; @@ -1031,38 +971,42 @@ static struct ctl_table brnf_table[] = { }; #endif -int __init br_netfilter_init(void) +static int __init br_netfilter_init(void) { int ret; - ret = dst_entries_init(&fake_dst_ops); + ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); if (ret < 0) return ret; - ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - if (ret < 0) { - dst_entries_destroy(&fake_dst_ops); - return ret; - } #ifdef CONFIG_SYSCTL brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); if (brnf_sysctl_header == NULL) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - dst_entries_destroy(&fake_dst_ops); - return -ENOMEM; + ret = -ENOMEM; + goto err1; } #endif printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; +err1: + nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + return ret; } -void br_netfilter_fini(void) +static void __exit br_netfilter_fini(void) { nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(brnf_sysctl_header); #endif - dst_entries_destroy(&fake_dst_ops); } + +module_init(br_netfilter_init); +module_exit(br_netfilter_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Lennert Buytenhek "); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge"); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 90a91e137acc..0fa66b83685f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -602,7 +602,7 @@ out_af: return err; } -void __exit br_netlink_fini(void) +void br_netlink_fini(void) { br_mdb_uninit(); rtnl_af_unregister(&br_af_ops); diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c new file mode 100644 index 000000000000..387cb3bd017c --- /dev/null +++ b/net/bridge/br_nf_core.c @@ -0,0 +1,96 @@ +/* + * Handle firewalling core + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * Bart De Schuymer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. + */ + +#include +#include +#include +#include +#include + +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include +#endif + +static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ +} + +static void fake_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ +} + +static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + return NULL; +} + +static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + return NULL; +} + +static unsigned int fake_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops fake_dst_ops = { + .family = AF_INET, + .protocol = cpu_to_be16(ETH_P_IP), + .update_pmtu = fake_update_pmtu, + .redirect = fake_redirect, + .cow_metrics = fake_cow_metrics, + .neigh_lookup = fake_neigh_lookup, + .mtu = fake_mtu, +}; + +/* + * Initialize bogus route table used to keep netfilter happy. + * Currently, we fill in the PMTU entry because netfilter + * refragmentation needs it, and the rt_flags entry because + * ipt_REJECT needs it. Future netfilter modules might + * require us to fill additional fields. + */ +static const u32 br_dst_default_metrics[RTAX_MAX] = { + [RTAX_MTU - 1] = 1500, +}; + +void br_netfilter_rtable_init(struct net_bridge *br) +{ + struct rtable *rt = &br->fake_rtable; + + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.dev = br->dev; + rt->dst.path = &rt->dst; + dst_init_metrics(&rt->dst, br_dst_default_metrics, true); + rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; + rt->dst.ops = &fake_dst_ops; +} + +int __init br_nf_core_init(void) +{ + return dst_entries_init(&fake_dst_ops); +} + +void br_nf_core_fini(void) +{ + dst_entries_destroy(&fake_dst_ops); +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 62a7fa2e3569..d304d752c091 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -221,7 +221,7 @@ struct net_bridge struct pcpu_sw_netstats __percpu *stats; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct rtable fake_rtable; bool nf_call_iptables; bool nf_call_ip6tables; @@ -751,13 +751,13 @@ static inline int br_vlan_enabled(struct net_bridge *br) #endif /* br_netfilter.c */ -#ifdef CONFIG_BRIDGE_NETFILTER -int br_netfilter_init(void); -void br_netfilter_fini(void); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) +int br_nf_core_init(void); +void br_nf_core_fini(void); void br_netfilter_rtable_init(struct net_bridge *); #else -#define br_netfilter_init() (0) -#define br_netfilter_fini() do { } while (0) +static inline int br_nf_core_init(void) { return 0; } +static inline void br_nf_core_fini(void) {} #define br_netfilter_rtable_init(x) #endif diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index c9e2572b15f4..cb431c6016ee 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -629,7 +629,7 @@ static ssize_t multicast_startup_query_interval_store( } static DEVICE_ATTR_RW(multicast_startup_query_interval); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { @@ -763,7 +763,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, &dev_attr_nf_call_ip6tables.attr, &dev_attr_nf_call_arptables.attr, -- cgit v1.2.3 From 99c55f7d47c0dc6fc64729f37bf435abf43f4c60 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:57 -0700 Subject: bpf: introduce BPF syscall and maps BPF syscall is a multiplexor for a range of different operations on eBPF. This patch introduces syscall with single command to create a map. Next patch adds commands to access maps. 'maps' is a generic storage of different types for sharing data between kernel and userspace. Userspace example: /* this syscall wrapper creates a map with given type and attributes * and returns map_fd on success. * use close(map_fd) to delete the map */ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } 'union bpf_attr' is backwards compatible with future extensions. More details in Documentation/networking/filter.txt and in manpage Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 39 +++++++++ include/linux/bpf.h | 41 +++++++++ include/uapi/linux/bpf.h | 23 +++++ kernel/bpf/Makefile | 2 +- kernel/bpf/syscall.c | 169 ++++++++++++++++++++++++++++++++++++ 5 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf.h create mode 100644 kernel/bpf/syscall.c (limited to 'include') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 014e0319a5c4..4a01d71785e9 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -1001,6 +1001,45 @@ instruction that loads 64-bit immediate value into a dst_reg. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. +eBPF maps +--------- +'maps' is a generic storage of different types for sharing data between kernel +and userspace. + +The maps are accessed from user space via BPF syscall, which has commands: +- create a map with given type and attributes + map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) + using attr->map_type, attr->key_size, attr->value_size, attr->max_entries + returns process-local file descriptor or negative error + +- lookup key in a given map + err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key, attr->value + returns zero and stores found elem into value or negative error + +- create or update key/value pair in a given map + err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key, attr->value + returns zero or negative error + +- find and delete element by key in a given map + err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key + +- to delete map: close(fd) + Exiting process will delete maps automatically + +userspace programs use this syscall to create/access maps that eBPF programs +are concurrently updating. + +maps can have different types: hash, array, bloom filter, radix-tree, etc. + +The map is defined by: + . type + . max number of elements + . key size in bytes + . value size in bytes + Testing ------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h new file mode 100644 index 000000000000..48014a71f0fe --- /dev/null +++ b/include/linux/bpf.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _LINUX_BPF_H +#define _LINUX_BPF_H 1 + +#include +#include + +struct bpf_map; + +/* map is generic key/value storage optionally accesible by eBPF programs */ +struct bpf_map_ops { + /* funcs callable from userspace (via syscall) */ + struct bpf_map *(*map_alloc)(union bpf_attr *attr); + void (*map_free)(struct bpf_map *); +}; + +struct bpf_map { + atomic_t refcnt; + enum bpf_map_type map_type; + u32 key_size; + u32 value_size; + u32 max_entries; + struct bpf_map_ops *ops; + struct work_struct work; +}; + +struct bpf_map_type_list { + struct list_head list_node; + struct bpf_map_ops *ops; + enum bpf_map_type type; +}; + +void bpf_register_map_type(struct bpf_map_type_list *tl); +void bpf_map_put(struct bpf_map *map); + +#endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 479ed0b6be16..f58a10f9670c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -62,4 +62,27 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; +/* BPF syscall commands */ +enum bpf_cmd { + /* create a map with given type and attributes + * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size) + * returns fd or negative error + * map is deleted when fd is closed + */ + BPF_MAP_CREATE, +}; + +enum bpf_map_type { + BPF_MAP_TYPE_UNSPEC, +}; + +union bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __u32 map_type; /* one of enum bpf_map_type */ + __u32 key_size; /* size of key in bytes */ + __u32 value_size; /* size of value in bytes */ + __u32 max_entries; /* max number of entries in a map */ + }; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a71145e2769..e9f7334ed07a 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1 +1 @@ -obj-y := core.o +obj-y := core.o syscall.o diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..428a0e23adc0 --- /dev/null +++ b/kernel/bpf/syscall.c @@ -0,0 +1,169 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include + +static LIST_HEAD(bpf_map_types); + +static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) +{ + struct bpf_map_type_list *tl; + struct bpf_map *map; + + list_for_each_entry(tl, &bpf_map_types, list_node) { + if (tl->type == attr->map_type) { + map = tl->ops->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = tl->ops; + map->map_type = attr->map_type; + return map; + } + } + return ERR_PTR(-EINVAL); +} + +/* boot time registration of different map implementations */ +void bpf_register_map_type(struct bpf_map_type_list *tl) +{ + list_add(&tl->list_node, &bpf_map_types); +} + +/* called from workqueue */ +static void bpf_map_free_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + + /* implementation dependent freeing */ + map->ops->map_free(map); +} + +/* decrement map refcnt and schedule it for freeing via workqueue + * (unrelying map implementation ops->map_free() might sleep) + */ +void bpf_map_put(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->refcnt)) { + INIT_WORK(&map->work, bpf_map_free_deferred); + schedule_work(&map->work); + } +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + struct bpf_map *map = filp->private_data; + + bpf_map_put(map); + return 0; +} + +static const struct file_operations bpf_map_fops = { + .release = bpf_map_release, +}; + +/* helper macro to check that unused fields 'union bpf_attr' are zero */ +#define CHECK_ATTR(CMD) \ + memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ + sizeof(attr->CMD##_LAST_FIELD), 0, \ + sizeof(*attr) - \ + offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ + sizeof(attr->CMD##_LAST_FIELD)) != NULL + +#define BPF_MAP_CREATE_LAST_FIELD max_entries +/* called via syscall */ +static int map_create(union bpf_attr *attr) +{ + struct bpf_map *map; + int err; + + err = CHECK_ATTR(BPF_MAP_CREATE); + if (err) + return -EINVAL; + + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ + map = find_and_alloc_map(attr); + if (IS_ERR(map)) + return PTR_ERR(map); + + atomic_set(&map->refcnt, 1); + + err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_map; + + return err; + +free_map: + map->ops->map_free(map); + return err; +} + +SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +{ + union bpf_attr attr = {}; + int err; + + /* the syscall is limited to root temporarily. This restriction will be + * lifted when security audit is clean. Note that eBPF+tracing must have + * this restriction, since it may pass kernel data to user space + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!access_ok(VERIFY_READ, uattr, 1)) + return -EFAULT; + + if (size > PAGE_SIZE) /* silly large */ + return -E2BIG; + + /* If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + size = sizeof(attr); + } + + /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + if (copy_from_user(&attr, uattr, size) != 0) + return -EFAULT; + + switch (cmd) { + case BPF_MAP_CREATE: + err = map_create(&attr); + break; + default: + err = -EINVAL; + break; + } + + return err; +} -- cgit v1.2.3 From 749730ce42a2121e1c88350d69478bff3994b10a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:58 -0700 Subject: bpf: enable bpf syscall on x64 and i386 done as separate commit to ease conflict resolution Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 3 ++- include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 3 +++ 5 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 028b78168d85..9fe1b5d002f0 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -363,3 +363,4 @@ 354 i386 seccomp sys_seccomp 355 i386 getrandom sys_getrandom 356 i386 memfd_create sys_memfd_create +357 i386 bpf sys_bpf diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 35dd922727b9..281150b539a2 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -327,6 +327,7 @@ 318 common getrandom sys_getrandom 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load +321 common bpf sys_bpf # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0f86d85a9ce4..bda9b81357cc 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -65,6 +65,7 @@ struct old_linux_dirent; struct perf_event_attr; struct file_handle; struct sigaltstack; +union bpf_attr; #include #include @@ -875,5 +876,5 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, const char __user *uargs); asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); - +asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 11d11bc5c78f..22749c134117 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -705,9 +705,11 @@ __SYSCALL(__NR_seccomp, sys_seccomp) __SYSCALL(__NR_getrandom, sys_getrandom) #define __NR_memfd_create 279 __SYSCALL(__NR_memfd_create, sys_memfd_create) +#define __NR_bpf 280 +__SYSCALL(__NR_bpf, sys_bpf) #undef __NR_syscalls -#define __NR_syscalls 280 +#define __NR_syscalls 281 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 391d4ddb6f4b..b4b5083f5f5e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -218,3 +218,6 @@ cond_syscall(sys_kcmp); /* operate on Secure Computing state */ cond_syscall(sys_seccomp); + +/* access BPF programs and maps */ +cond_syscall(sys_bpf); -- cgit v1.2.3 From db20fd2b01087bdfbe30bce314a198eefedcc42e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:59 -0700 Subject: bpf: add lookup/update/delete/iterate methods to BPF maps 'maps' is a generic storage of different types for sharing data between kernel and userspace. The maps are accessed from user space via BPF syscall, which has commands: - create a map with given type and attributes fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) returns fd or negative error - lookup key in a given map referenced by fd err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero and stores found elem into value or negative error - create or update key/value pair in a given map err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero or negative error - find and delete element by key in a given map err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key - iterate map elements (based on input key return next_key) err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->next_key - close(fd) deletes the map Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 8 ++ include/uapi/linux/bpf.h | 38 ++++++++ kernel/bpf/syscall.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 281 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 48014a71f0fe..2887f3f9da59 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -9,6 +9,7 @@ #include #include +#include struct bpf_map; @@ -17,6 +18,12 @@ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_free)(struct bpf_map *); + int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); + + /* funcs callable from userspace and from eBPF programs */ + void *(*map_lookup_elem)(struct bpf_map *map, void *key); + int (*map_update_elem)(struct bpf_map *map, void *key, void *value); + int (*map_delete_elem)(struct bpf_map *map, void *key); }; struct bpf_map { @@ -37,5 +44,6 @@ struct bpf_map_type_list { void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); +struct bpf_map *bpf_map_get(struct fd f); #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f58a10f9670c..395cabd2ca0a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -70,6 +70,35 @@ enum bpf_cmd { * map is deleted when fd is closed */ BPF_MAP_CREATE, + + /* lookup key in a given map + * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->value + * returns zero and stores found elem into value + * or negative error + */ + BPF_MAP_LOOKUP_ELEM, + + /* create or update key/value pair in a given map + * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->value + * returns zero or negative error + */ + BPF_MAP_UPDATE_ELEM, + + /* find and delete elem by key in a given map + * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key + * returns zero or negative error + */ + BPF_MAP_DELETE_ELEM, + + /* lookup key in a given map and return next key + * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->next_key + * returns zero and stores next key or negative error + */ + BPF_MAP_GET_NEXT_KEY, }; enum bpf_map_type { @@ -83,6 +112,15 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + }; } __attribute__((aligned(8))); #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 428a0e23adc0..f94349ecaf61 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -13,6 +13,7 @@ #include #include #include +#include static LIST_HEAD(bpf_map_types); @@ -111,6 +112,228 @@ free_map: return err; } +/* if error is returned, fd is released. + * On success caller should complete fd access with matching fdput() + */ +struct bpf_map *bpf_map_get(struct fd f) +{ + struct bpf_map *map; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_map_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + map = f.file->private_data; + + return map; +} + +/* helper to convert user pointers passed inside __aligned_u64 fields */ +static void __user *u64_to_ptr(__u64 val) +{ + return (void __user *) (unsigned long) val; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value + +static int map_lookup_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ESRCH; + rcu_read_lock(); + value = map->ops->map_lookup_elem(map, key); + if (!value) + goto err_unlock; + + err = -EFAULT; + if (copy_to_user(uvalue, value, map->value_size) != 0) + goto err_unlock; + + err = 0; + +err_unlock: + rcu_read_unlock(); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value + +static int map_update_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); + if (!value) + goto free_key; + + err = -EFAULT; + if (copy_from_user(value, uvalue, map->value_size) != 0) + goto free_value; + + /* eBPF program that use maps are running under rcu_read_lock(), + * therefore all map accessors rely on this fact, so do the same here + */ + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value); + rcu_read_unlock(); + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_DELETE_ELEM_LAST_FIELD key + +static int map_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key; + int err; + + if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_delete_elem(map, key); + rcu_read_unlock(); + +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key + +static int map_get_next_key(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *unext_key = u64_to_ptr(attr->next_key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *next_key; + int err; + + if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + next_key = kmalloc(map->key_size, GFP_USER); + if (!next_key) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_get_next_key(map, key, next_key); + rcu_read_unlock(); + if (err) + goto free_next_key; + + err = -EFAULT; + if (copy_to_user(unext_key, next_key, map->key_size) != 0) + goto free_next_key; + + err = 0; + +free_next_key: + kfree(next_key); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -160,6 +383,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_CREATE: err = map_create(&attr); break; + case BPF_MAP_LOOKUP_ELEM: + err = map_lookup_elem(&attr); + break; + case BPF_MAP_UPDATE_ELEM: + err = map_update_elem(&attr); + break; + case BPF_MAP_DELETE_ELEM: + err = map_delete_elem(&attr); + break; + case BPF_MAP_GET_NEXT_KEY: + err = map_get_next_key(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 09756af46893c18839062976c3252e93a1beeba7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:00 -0700 Subject: bpf: expand BPF syscall with program load/unload eBPF programs are similar to kernel modules. They are loaded by the user process and automatically unloaded when process exits. Each eBPF program is a safe run-to-completion set of instructions. eBPF verifier statically determines that the program terminates and is safe to execute. The following syscall wrapper can be used to load the program: int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_cnt, const char *license) { union bpf_attr attr = { .prog_type = prog_type, .insns = ptr_to_u64(insns), .insn_cnt = insn_cnt, .license = ptr_to_u64(license), }; return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } where 'insns' is an array of eBPF instructions and 'license' is a string that must be GPL compatible to call helper functions marked gpl_only Upon succesful load the syscall returns prog_fd. Use close(prog_fd) to unload the program. User space tests and examples follow in the later patches Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 38 +++++++++++ include/linux/filter.h | 8 +-- include/uapi/linux/bpf.h | 26 ++++++++ kernel/bpf/core.c | 29 +++++---- kernel/bpf/syscall.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 246 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2887f3f9da59..92979182be81 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,4 +46,42 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); struct bpf_map *bpf_map_get(struct fd f); +/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs + * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL + * instructions after verifying + */ +struct bpf_func_proto { + u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + bool gpl_only; +}; + +struct bpf_verifier_ops { + /* return eBPF function prototype for verification */ + const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); +}; + +struct bpf_prog_type_list { + struct list_head list_node; + struct bpf_verifier_ops *ops; + enum bpf_prog_type type; +}; + +void bpf_register_prog_type(struct bpf_prog_type_list *tl); + +struct bpf_prog; + +struct bpf_prog_aux { + atomic_t refcnt; + bool is_gpl_compatible; + enum bpf_prog_type prog_type; + struct bpf_verifier_ops *ops; + struct bpf_map **used_maps; + u32 used_map_cnt; + struct bpf_prog *prog; + struct work_struct work; +}; + +void bpf_prog_put(struct bpf_prog *prog); +struct bpf_prog *bpf_prog_get(u32 ufd); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 1a0bc6d134d7..4ffc0958d85e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -21,6 +21,7 @@ struct sk_buff; struct sock; struct seccomp_data; +struct bpf_prog_aux; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -300,17 +301,12 @@ struct bpf_binary_header { u8 image[]; }; -struct bpf_work_struct { - struct bpf_prog *prog; - struct work_struct work; -}; - struct bpf_prog { u16 pages; /* Number of allocated pages */ bool jited; /* Is our filter JIT'ed? */ u32 len; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ - struct bpf_work_struct *work; /* Deferred free work struct */ + struct bpf_prog_aux *aux; /* Auxiliary fields */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); /* Instructions for interpreter */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 395cabd2ca0a..424f442016e7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -99,12 +99,23 @@ enum bpf_cmd { * returns zero and stores next key or negative error */ BPF_MAP_GET_NEXT_KEY, + + /* verify and load eBPF program + * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size) + * Using attr->prog_type, attr->insns, attr->license + * returns fd or negative error + */ + BPF_PROG_LOAD, }; enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, }; +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -121,6 +132,21 @@ union bpf_attr { __aligned_u64 next_key; }; }; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + }; } __attribute__((aligned(8))); +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +enum bpf_func_id { + BPF_FUNC_unspec, + __BPF_FUNC_MAX_ID, +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b7002488251..f0c30c59b317 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include #include #include +#include /* Registers */ #define BPF_R0 regs[BPF_REG_0] @@ -71,7 +72,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); @@ -79,14 +80,14 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags); - if (ws == NULL) { + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + if (aux == NULL) { vfree(fp); return NULL; } fp->pages = size / PAGE_SIZE; - fp->work = ws; + fp->aux = aux; return fp; } @@ -110,10 +111,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; - /* We keep fp->work from fp_old around in the new + /* We keep fp->aux from fp_old around in the new * reallocated structure. */ - fp_old->work = NULL; + fp_old->aux = NULL; __bpf_prog_free(fp_old); } @@ -123,7 +124,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_realloc); void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->work); + kfree(fp->aux); vfree(fp); } EXPORT_SYMBOL_GPL(__bpf_prog_free); @@ -638,19 +639,19 @@ EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static void bpf_prog_free_deferred(struct work_struct *work) { - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; - ws = container_of(work, struct bpf_work_struct, work); - bpf_jit_free(ws->prog); + aux = container_of(work, struct bpf_prog_aux, work); + bpf_jit_free(aux->prog); } /* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - struct bpf_work_struct *ws = fp->work; + struct bpf_prog_aux *aux = fp->aux; - INIT_WORK(&ws->work, bpf_prog_free_deferred); - ws->prog = fp; - schedule_work(&ws->work); + INIT_WORK(&aux->work, bpf_prog_free_deferred); + aux->prog = fp; + schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f94349ecaf61..0afb4eaa1887 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include static LIST_HEAD(bpf_map_types); @@ -334,6 +336,166 @@ err_put: return err; } +static LIST_HEAD(bpf_prog_types); + +static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) +{ + struct bpf_prog_type_list *tl; + + list_for_each_entry(tl, &bpf_prog_types, list_node) { + if (tl->type == type) { + prog->aux->ops = tl->ops; + prog->aux->prog_type = type; + return 0; + } + } + return -EINVAL; +} + +void bpf_register_prog_type(struct bpf_prog_type_list *tl) +{ + list_add(&tl->list_node, &bpf_prog_types); +} + +/* drop refcnt on maps used by eBPF program and free auxilary data */ +static void free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + + kfree(aux->used_maps); +} + +void bpf_prog_put(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) { + free_used_maps(prog->aux); + bpf_prog_free(prog); + } +} + +static int bpf_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_prog_fops = { + .release = bpf_prog_release, +}; + +static struct bpf_prog *get_prog(struct fd f) +{ + struct bpf_prog *prog; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_prog_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + prog = f.file->private_data; + + return prog; +} + +/* called by sockets/tracing/seccomp before attaching program to an event + * pairs with bpf_prog_put() + */ +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_prog *prog; + + prog = get_prog(f); + + if (IS_ERR(prog)) + return prog; + + atomic_inc(&prog->aux->refcnt); + fdput(f); + return prog; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_PROG_LOAD_LAST_FIELD license + +static int bpf_prog_load(union bpf_attr *attr) +{ + enum bpf_prog_type type = attr->prog_type; + struct bpf_prog *prog; + int err; + char license[128]; + bool is_gpl; + + if (CHECK_ATTR(BPF_PROG_LOAD)) + return -EINVAL; + + /* copy eBPF program license from user space */ + if (strncpy_from_user(license, u64_to_ptr(attr->license), + sizeof(license) - 1) < 0) + return -EFAULT; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + is_gpl = license_is_gpl_compatible(license); + + if (attr->insn_cnt >= BPF_MAXINSNS) + return -EINVAL; + + /* plain bpf_prog allocation */ + prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); + if (!prog) + return -ENOMEM; + + prog->len = attr->insn_cnt; + + err = -EFAULT; + if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + prog->len * sizeof(struct bpf_insn)) != 0) + goto free_prog; + + prog->orig_prog = NULL; + prog->jited = false; + + atomic_set(&prog->aux->refcnt, 1); + prog->aux->is_gpl_compatible = is_gpl; + + /* find program type: socket_filter vs tracing_filter */ + err = find_prog_type(type, prog); + if (err < 0) + goto free_prog; + + /* run eBPF verifier */ + /* err = bpf_check(prog, tb); */ + + if (err < 0) + goto free_used_maps; + + /* eBPF program is ready to be JITed */ + bpf_prog_select_runtime(prog); + + err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_used_maps; + + return err; + +free_used_maps: + free_used_maps(prog->aux); +free_prog: + bpf_prog_free(prog); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -395,6 +557,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_PROG_LOAD: + err = bpf_prog_load(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 51580e798cb61b0fc63fa3aa6c5c975375aa0550 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:02 -0700 Subject: bpf: verifier (add docs) this patch adds all of eBPF verfier documentation and empty bpf_check() The end goal for the verifier is to statically check safety of the program. Verifier will catch: - loops - out of range jumps - unreachable instructions - invalid instructions - uninitialized register access - uninitialized stack access - misaligned stack access - out of range stack access - invalid calling convention More details in Documentation/networking/filter.txt Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 224 ++++++++++++++++++++++++++++++++++++ include/linux/bpf.h | 2 + kernel/bpf/Makefile | 2 +- kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 133 +++++++++++++++++++++ 5 files changed, 361 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/verifier.c (limited to 'include') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 4a01d71785e9..5ce4d07406a5 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -1001,6 +1001,99 @@ instruction that loads 64-bit immediate value into a dst_reg. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. +eBPF verifier +------------- +The safety of the eBPF program is determined in two steps. + +First step does DAG check to disallow loops and other CFG validation. +In particular it will detect programs that have unreachable instructions. +(though classic BPF checker allows them) + +Second step starts from the first insn and descends all possible paths. +It simulates execution of every insn and observes the state change of +registers and stack. + +At the start of the program the register R1 contains a pointer to context +and has type PTR_TO_CTX. +If verifier sees an insn that does R2=R1, then R2 has now type +PTR_TO_CTX as well and can be used on the right hand side of expression. +If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=UNKNOWN_VALUE, +since addition of two valid pointers makes invalid pointer. +(In 'secure' mode verifier will reject any type of pointer arithmetic to make +sure that kernel addresses don't leak to unprivileged users) + +If register was never written to, it's not readable: + bpf_mov R0 = R2 + bpf_exit +will be rejected, since R2 is unreadable at the start of the program. + +After kernel function call, R1-R5 are reset to unreadable and +R0 has a return type of the function. + +Since R6-R9 are callee saved, their state is preserved across the call. + bpf_mov R6 = 1 + bpf_call foo + bpf_mov R0 = R6 + bpf_exit +is a correct program. If there was R1 instead of R6, it would have +been rejected. + +load/store instructions are allowed only with registers of valid types, which +are PTR_TO_CTX, PTR_TO_MAP, FRAME_PTR. They are bounds and alignment checked. +For example: + bpf_mov R1 = 1 + bpf_mov R2 = 2 + bpf_xadd *(u32 *)(R1 + 3) += R2 + bpf_exit +will be rejected, since R1 doesn't have a valid pointer type at the time of +execution of instruction bpf_xadd. + +At the start R1 type is PTR_TO_CTX (a pointer to generic 'struct bpf_context') +A callback is used to customize verifier to restrict eBPF program access to only +certain fields within ctx structure with specified size and alignment. + +For example, the following insn: + bpf_ld R0 = *(u32 *)(R6 + 8) +intends to load a word from address R6 + 8 and store it into R0 +If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know +that offset 8 of size 4 bytes can be accessed for reading, otherwise +the verifier will reject the program. +If R6=FRAME_PTR, then access should be aligned and be within +stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8, +so it will fail verification, since it's out of bounds. + +The verifier will allow eBPF program to read data from stack only after +it wrote into it. +Classic BPF verifier does similar check with M[0-15] memory slots. +For example: + bpf_ld R0 = *(u32 *)(R10 - 4) + bpf_exit +is invalid program. +Though R10 is correct read-only register and has type FRAME_PTR +and R10 - 4 is within stack bounds, there were no stores into that location. + +Pointer register spill/fill is tracked as well, since four (R6-R9) +callee saved registers may not be enough for some programs. + +Allowed function calls are customized with bpf_verifier_ops->get_func_proto() +The eBPF verifier will check that registers match argument constraints. +After the call register R0 will be set to return type of the function. + +Function calls is a main mechanism to extend functionality of eBPF programs. +Socket filters may let programs to call one set of functions, whereas tracing +filters may allow completely different set. + +If a function made accessible to eBPF program, it needs to be thought through +from safety point of view. The verifier will guarantee that the function is +called with valid arguments. + +seccomp vs socket filters have different security restrictions for classic BPF. +Seccomp solves this by two stage verifier: classic BPF verifier is followed +by seccomp verifier. In case of eBPF one configurable verifier is shared for +all use cases. + +See details of eBPF verifier in kernel/bpf/verifier.c + eBPF maps --------- 'maps' is a generic storage of different types for sharing data between kernel @@ -1040,6 +1133,137 @@ The map is defined by: . key size in bytes . value size in bytes +Understanding eBPF verifier messages +------------------------------------ + +The following are few examples of invalid eBPF programs and verifier error +messages as seen in the log: + +Program with unreachable instructions: +static struct bpf_insn prog[] = { + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), +}; +Error: + unreachable insn 1 + +Program that reads uninitialized register: + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), +Error: + 0: (bf) r0 = r2 + R2 !read_ok + +Program that doesn't initialize R0 before exiting: + BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), +Error: + 0: (bf) r2 = r1 + 1: (95) exit + R0 !read_ok + +Program that accesses stack out of bounds: + BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 +8) = 0 + invalid stack off=8 size=8 + +Program that doesn't initialize stack before passing its address into function: + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), +Error: + 0: (bf) r2 = r10 + 1: (07) r2 += -8 + 2: (b7) r1 = 0x0 + 3: (85) call 1 + invalid indirect read from stack off -8+0 size 8 + +Program that uses invalid map_fd=0 while calling to map_lookup_elem() function: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + fd 0 is not pointing to valid bpf_map + +Program that doesn't check return value of map_lookup_elem() before accessing +map element: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + 5: (7a) *(u64 *)(r0 +0) = 0 + R0 invalid mem access 'map_value_or_null' + +Program that correctly checks map_lookup_elem() returned value for NULL, but +accesses the memory with incorrect alignment: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+1 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +4) = 0 + misaligned access off 4 size 8 + +Program that correctly checks map_lookup_elem() returned value for NULL and +accesses memory with correct alignment in one side of 'if' branch, but fails +to do so in the other side of 'if' branch: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+2 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +0) = 0 + 7: (95) exit + + from 5 to 8: R0=imm0 R10=fp + 8: (7a) *(u64 *)(r0 +0) = 1 + R0 invalid mem access 'imm' + Testing ------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 92979182be81..9dfeb36f8971 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -83,5 +83,7 @@ struct bpf_prog_aux { void bpf_prog_put(struct bpf_prog *prog); struct bpf_prog *bpf_prog_get(u32 ufd); +/* verify correctness of eBPF program */ +int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e9f7334ed07a..3c726b0995b7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1 +1 @@ -obj-y := core.o syscall.o +obj-y := core.o syscall.o verifier.o diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b513659d120f..74b3628c5fdb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -507,7 +507,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - /* err = bpf_check(prog, tb); */ + err = bpf_check(prog, attr); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c new file mode 100644 index 000000000000..d6f9c3d6b4d7 --- /dev/null +++ b/kernel/bpf/verifier.c @@ -0,0 +1,133 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* bpf_check() is a static code analyzer that walks eBPF program + * instruction by instruction and updates register/stack state. + * All paths of conditional branches are analyzed until 'bpf_exit' insn. + * + * The first pass is depth-first-search to check that the program is a DAG. + * It rejects the following programs: + * - larger than BPF_MAXINSNS insns + * - if loop is present (detected via back-edge) + * - unreachable insns exist (shouldn't be a forest. program = one function) + * - out of bounds or malformed jumps + * The second pass is all possible path descent from the 1st insn. + * Since it's analyzing all pathes through the program, the length of the + * analysis is limited to 32k insn, which may be hit even if total number of + * insn is less then 4K, but there are too many branches that change stack/regs. + * Number of 'branches to be analyzed' is limited to 1k + * + * On entry to each instruction, each register has a type, and the instruction + * changes the types of the registers depending on instruction semantics. + * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is + * copied to R1. + * + * All registers are 64-bit. + * R0 - return register + * R1-R5 argument passing registers + * R6-R9 callee saved registers + * R10 - frame pointer read-only + * + * At the start of BPF program the register R1 contains a pointer to bpf_context + * and has type PTR_TO_CTX. + * + * Verifier tracks arithmetic operations on pointers in case: + * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20), + * 1st insn copies R10 (which has FRAME_PTR) type into R1 + * and 2nd arithmetic instruction is pattern matched to recognize + * that it wants to construct a pointer to some element within stack. + * So after 2nd insn, the register R1 has type PTR_TO_STACK + * (and -20 constant is saved for further stack bounds checking). + * Meaning that this reg is a pointer to stack plus known immediate constant. + * + * Most of the time the registers have UNKNOWN_VALUE type, which + * means the register has some value, but it's not a valid pointer. + * (like pointer plus pointer becomes UNKNOWN_VALUE type) + * + * When verifier sees load or store instructions the type of base register + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer + * types recognized by check_mem_access() function. + * + * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' + * and the range of [ptr, ptr + map's value_size) is accessible. + * + * registers used to pass values to function calls are checked against + * function argument constraints. + * + * ARG_PTR_TO_MAP_KEY is one of such argument constraints. + * It means that the register type passed to this function must be + * PTR_TO_STACK and it will be used inside the function as + * 'pointer to map element key' + * + * For example the argument constraints for bpf_map_lookup_elem(): + * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + * .arg1_type = ARG_CONST_MAP_PTR, + * .arg2_type = ARG_PTR_TO_MAP_KEY, + * + * ret_type says that this function returns 'pointer to map elem value or null' + * function expects 1st argument to be a const pointer to 'struct bpf_map' and + * 2nd argument should be a pointer to stack, which will be used inside + * the helper function as a pointer to map element key. + * + * On the kernel side the helper function looks like: + * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) + * { + * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + * void *key = (void *) (unsigned long) r2; + * void *value; + * + * here kernel can access 'key' and 'map' pointers safely, knowing that + * [key, key + map->key_size) bytes are valid and were initialized on + * the stack of eBPF program. + * } + * + * Corresponding eBPF program may look like: + * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK + * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP + * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + * here verifier looks at prototype of map_lookup_elem() and sees: + * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok, + * Now verifier knows that this map has key of R1->map_ptr->key_size bytes + * + * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far, + * Now verifier checks that [R2, R2 + map's key_size) are within stack limits + * and were initialized prior to this call. + * If it's ok, then verifier allows this BPF_CALL insn and looks at + * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets + * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function + * returns ether pointer to map value or NULL. + * + * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' + * insn, the register holding that pointer in the true branch changes state to + * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false + * branch. See check_cond_jmp_op(). + * + * After the call R0 is set to return type of the function and registers R1-R5 + * are set to NOT_INIT to indicate that they are no longer readable. + */ + +int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +{ + int ret = -EINVAL; + + return ret; +} -- cgit v1.2.3 From cbd357008604925355ae7b54a09137dabb81b580 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:03 -0700 Subject: bpf: verifier (add ability to receive verification log) add optional attributes for BPF_PROG_LOAD syscall: union bpf_attr { struct { ... __u32 log_level; /* verbosity level of eBPF verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied 'char *buffer' */ }; }; when log_level > 0 the verifier will return its verification log in the user supplied buffer 'log_buf' which can be used by program author to analyze why verifier rejected given program. 'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt provides several examples of these messages, like the program: BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), BPF_EXIT_INSN(), will be rejected with the following multi-line message in log_buf: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (b7) r1 = 0 4: (85) call 1 5: (15) if r0 == 0x0 goto pc+1 R0=map_ptr R10=fp 6: (7a) *(u64 *)(r0 +4) = 0 misaligned access off 4 size 8 The format of the output can change at any time as verifier evolves. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 + kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 239 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 424f442016e7..31b0ac208a52 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -138,6 +138,9 @@ union bpf_attr { __u32 insn_cnt; __aligned_u64 insns; __aligned_u64 license; + __u32 log_level; /* verbosity level of verifier */ + __u32 log_size; /* size of user buffer */ + __aligned_u64 log_buf; /* user supplied buffer */ }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 74b3628c5fdb..ba61c8c16032 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -458,7 +458,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD license +#define BPF_PROG_LOAD_LAST_FIELD log_buf static int bpf_prog_load(union bpf_attr *attr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d6f9c3d6b4d7..871edc1f2e1f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -125,9 +125,244 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ +/* single container for all structs + * one verifier_env per bpf_check() call + */ +struct verifier_env { +}; + +/* verbose verifier prints what it's seeing + * bpf_check() is called under lock, so no race to access these global vars + */ +static u32 log_level, log_size, log_len; +static char *log_buf; + +static DEFINE_MUTEX(bpf_verifier_lock); + +/* log_level controls verbosity level of eBPF verifier. + * verbose() is used to dump the verification trace to the log, so the user + * can figure out what's wrong with the program + */ +static void verbose(const char *fmt, ...) +{ + va_list args; + + if (log_level == 0 || log_len >= log_size - 1) + return; + + va_start(args, fmt); + log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + va_end(args); +} + +static const char *const bpf_class_string[] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +static const char *const bpf_alu_string[] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JGE >> 4] = ">=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSGE >> 4] = "s>=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_insn(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_SRC(insn->code) == BPF_X) + verbose("(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + else + verbose("(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->imm); + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose("BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_st_%02x\n", insn->code); + return; + } + verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_ldx_%02x\n", insn->code); + return; + } + verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose("(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM) { + verbose("(%02x) r%d = 0x%x\n", + insn->code, insn->dst_reg, insn->imm); + } else { + verbose("BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose("(%02x) call %d\n", insn->code, insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose("(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose("(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose("(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + } +} + int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) { + char __user *log_ubuf = NULL; + struct verifier_env *env; int ret = -EINVAL; + if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + return -E2BIG; + + /* 'struct verifier_env' can be global, but since it's not small, + * allocate/free it every time bpf_check() is called + */ + env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + if (attr->log_level || attr->log_buf || attr->log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log_level = attr->log_level; + log_ubuf = (char __user *) (unsigned long) attr->log_buf; + log_size = attr->log_size; + log_len = 0; + + ret = -EINVAL; + /* log_* values have to be sane */ + if (log_size < 128 || log_size > UINT_MAX >> 8 || + log_level == 0 || log_ubuf == NULL) + goto free_env; + + ret = -ENOMEM; + log_buf = vmalloc(log_size); + if (!log_buf) + goto free_env; + } else { + log_level = 0; + } + + /* ret = do_check(env); */ + + if (log_level && log_len >= log_size - 1) { + BUG_ON(log_len >= log_size); + /* verifier log exceeded user supplied buffer */ + ret = -ENOSPC; + /* fall through to return what was recorded */ + } + + /* copy verifier log back to user space including trailing zero */ + if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + ret = -EFAULT; + goto free_log_buf; + } + + +free_log_buf: + if (log_level) + vfree(log_buf); +free_env: + kfree(env); + mutex_unlock(&bpf_verifier_lock); return ret; } -- cgit v1.2.3 From 0246e64d9a5fcd4805198de59b9b5cf1f974eb41 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:04 -0700 Subject: bpf: handle pseudo BPF_LD_IMM64 insn eBPF programs passed from userspace are using pseudo BPF_LD_IMM64 instructions to refer to process-local map_fd. Scan the program for such instructions and if FDs are valid, convert them to 'struct bpf_map' pointers which will be used by verifier to check access to maps in bpf_map_lookup/update() calls. If program passes verifier, convert pseudo BPF_LD_IMM64 into generic by dropping BPF_PSEUDO_MAP_FD flag. Note that eBPF interpreter is generic and knows nothing about pseudo insns. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 6 ++ kernel/bpf/verifier.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4ffc0958d85e..ca95abd2bed1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -145,6 +145,12 @@ struct bpf_prog_aux; .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) +#define BPF_PSEUDO_MAP_FD 1 + +/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 871edc1f2e1f..7227543e474b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -125,10 +125,15 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ +#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ + /* single container for all structs * one verifier_env per bpf_check() call */ struct verifier_env { + struct bpf_prog *prog; /* eBPF program being verified */ + struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + u32 used_map_cnt; /* number of used maps */ }; /* verbose verifier prints what it's seeing @@ -300,6 +305,115 @@ static void print_bpf_insn(struct bpf_insn *insn) } } +/* return the map pointer stored inside BPF_LD_IMM64 instruction */ +static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) +{ + u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; + + return (struct bpf_map *) (unsigned long) imm64; +} + +/* look for pseudo eBPF instructions that access map FDs and + * replace them with actual map pointers + */ +static int replace_map_fd_with_map_ptr(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i, j; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_map *map; + struct fd f; + + if (i == insn_cnt - 1 || insn[1].code != 0 || + insn[1].dst_reg != 0 || insn[1].src_reg != 0 || + insn[1].off != 0) { + verbose("invalid bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + if (insn->src_reg == 0) + /* valid generic load 64-bit imm */ + goto next_insn; + + if (insn->src_reg != BPF_PSEUDO_MAP_FD) { + verbose("unrecognized bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + f = fdget(insn->imm); + + map = bpf_map_get(f); + if (IS_ERR(map)) { + verbose("fd %d is not pointing to valid bpf_map\n", + insn->imm); + fdput(f); + return PTR_ERR(map); + } + + /* store map pointer inside BPF_LD_IMM64 instruction */ + insn[0].imm = (u32) (unsigned long) map; + insn[1].imm = ((u64) (unsigned long) map) >> 32; + + /* check whether we recorded this map already */ + for (j = 0; j < env->used_map_cnt; j++) + if (env->used_maps[j] == map) { + fdput(f); + goto next_insn; + } + + if (env->used_map_cnt >= MAX_USED_MAPS) { + fdput(f); + return -E2BIG; + } + + /* remember this map */ + env->used_maps[env->used_map_cnt++] = map; + + /* hold the map. If the program is rejected by verifier, + * the map will be released by release_maps() or it + * will be used by the valid program until it's unloaded + * and all maps are released in free_bpf_prog_info() + */ + atomic_inc(&map->refcnt); + + fdput(f); +next_insn: + insn++; + i++; + } + } + + /* now all pseudo BPF_LD_IMM64 instructions load valid + * 'struct bpf_map *' into a register instead of user map_fd. + * These pointers will be used later by verifier to validate map access. + */ + return 0; +} + +/* drop refcnt of maps used by the rejected program */ +static void release_maps(struct verifier_env *env) +{ + int i; + + for (i = 0; i < env->used_map_cnt; i++) + bpf_map_put(env->used_maps[i]); +} + +/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ +static void convert_pseudo_ld_imm64(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) + if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) + insn->src_reg = 0; +} + int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; @@ -316,6 +430,8 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (!env) return -ENOMEM; + env->prog = prog; + /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -342,8 +458,14 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) log_level = 0; } + ret = replace_map_fd_with_map_ptr(env); + if (ret < 0) + goto skip_full_check; + /* ret = do_check(env); */ +skip_full_check: + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -357,11 +479,36 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) goto free_log_buf; } + if (ret == 0 && env->used_map_cnt) { + /* if program passed verifier, update used_maps in bpf_prog_info */ + prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); + + if (!prog->aux->used_maps) { + ret = -ENOMEM; + goto free_log_buf; + } + + memcpy(prog->aux->used_maps, env->used_maps, + sizeof(env->used_maps[0]) * env->used_map_cnt); + prog->aux->used_map_cnt = env->used_map_cnt; + + /* program is valid. Convert pseudo bpf_ld_imm64 into generic + * bpf_ld_imm64 instructions + */ + convert_pseudo_ld_imm64(env); + } free_log_buf: if (log_level) vfree(log_buf); free_env: + if (!prog->aux->used_maps) + /* if we didn't copy map pointers into bpf_prog_info, release + * them now. Otherwise free_bpf_prog_info() will release them. + */ + release_maps(env); kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; -- cgit v1.2.3 From 17a5267067f3c372fec9ffb798d6eaba6b5e6a4c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:06 -0700 Subject: bpf: verifier (add verifier core) This patch adds verifier core which simulates execution of every insn and records the state of registers and program stack. Every branch instruction seen during simulation is pushed into state stack. When verifier reaches BPF_EXIT, it pops the state from the stack and continues until it reaches BPF_EXIT again. For program: 1: bpf_mov r1, xxx 2: if (r1 == 0) goto 5 3: bpf_mov r0, 1 4: goto 6 5: bpf_mov r0, 2 6: bpf_exit The verifier will walk insns: 1, 2, 3, 4, 6 then it will pop the state recorded at insn#2 and will continue: 5, 6 This way it walks all possible paths through the program and checks all possible values of registers. While doing so, it checks for: - invalid instructions - uninitialized register access - uninitialized stack access - misaligned stack access - out of range stack access - invalid calling convention - instruction encoding is not using reserved fields Kernel subsystem configures the verifier with two callbacks: - bool (*is_valid_access)(int off, int size, enum bpf_access_type type); that provides information to the verifer which fields of 'ctx' are accessible (remember 'ctx' is the first argument to eBPF program) - const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); returns argument constraints of kernel helper functions that eBPF program may call, so that verifier can checks that R1-R5 types match the prototype More details in Documentation/networking/filter.txt and in kernel/bpf/verifier.c Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 47 +++ kernel/bpf/verifier.c | 1075 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 1121 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9dfeb36f8971..3cf91754a957 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,6 +46,31 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); struct bpf_map *bpf_map_get(struct fd f); +/* function argument constraints */ +enum bpf_arg_type { + ARG_ANYTHING = 0, /* any argument is ok */ + + /* the following constraints used to prototype + * bpf_map_lookup/update/delete_elem() functions + */ + ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ + ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ + ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ + + /* the following constraints used to prototype bpf_memcmp() and other + * functions that access data on eBPF program stack + */ + ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ + ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ +}; + +/* type of values returned from helper functions */ +enum bpf_return_type { + RET_INTEGER, /* function returns integer */ + RET_VOID, /* function doesn't return anything */ + RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ +}; + /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL * instructions after verifying @@ -53,11 +78,33 @@ struct bpf_map *bpf_map_get(struct fd f); struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; + enum bpf_return_type ret_type; + enum bpf_arg_type arg1_type; + enum bpf_arg_type arg2_type; + enum bpf_arg_type arg3_type; + enum bpf_arg_type arg4_type; + enum bpf_arg_type arg5_type; +}; + +/* bpf_context is intentionally undefined structure. Pointer to bpf_context is + * the first argument to eBPF programs. + * For socket filters: 'struct bpf_context *' == 'struct sk_buff *' + */ +struct bpf_context; + +enum bpf_access_type { + BPF_READ = 1, + BPF_WRITE = 2 }; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); + + /* return true if 'size' wide access at offset 'off' within bpf_context + * with 'type' (read or write) is allowed + */ + bool (*is_valid_access)(int off, int size, enum bpf_access_type type); }; struct bpf_prog_type_list { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c689ab8e2713..a086dd3210a8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -125,6 +125,70 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ +/* types of values stored in eBPF registers */ +enum bpf_reg_type { + NOT_INIT = 0, /* nothing was written into register */ + UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ + PTR_TO_CTX, /* reg points to bpf_context */ + CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ + PTR_TO_MAP_VALUE, /* reg points to map element value */ + PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + FRAME_PTR, /* reg == frame_pointer */ + PTR_TO_STACK, /* reg == frame_pointer + imm */ + CONST_IMM, /* constant integer value */ +}; + +struct reg_state { + enum bpf_reg_type type; + union { + /* valid when type == CONST_IMM | PTR_TO_STACK */ + int imm; + + /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | + * PTR_TO_MAP_VALUE_OR_NULL + */ + struct bpf_map *map_ptr; + }; +}; + +enum bpf_stack_slot_type { + STACK_INVALID, /* nothing was stored in this stack slot */ + STACK_SPILL, /* 1st byte of register spilled into stack */ + STACK_SPILL_PART, /* other 7 bytes of register spill */ + STACK_MISC /* BPF program wrote some data into this slot */ +}; + +struct bpf_stack_slot { + enum bpf_stack_slot_type stype; + struct reg_state reg_st; +}; + +/* state of the program: + * type of all registers and stack info + */ +struct verifier_state { + struct reg_state regs[MAX_BPF_REG]; + struct bpf_stack_slot stack[MAX_BPF_STACK]; +}; + +/* linked list of verifier states used to prune search */ +struct verifier_state_list { + struct verifier_state state; + struct verifier_state_list *next; +}; + +/* verifier_state + insn_idx are pushed to stack when branch is encountered */ +struct verifier_stack_elem { + /* verifer state is 'st' + * before processing instruction 'insn_idx' + * and after processing instruction 'prev_insn_idx' + */ + struct verifier_state st; + int insn_idx; + int prev_insn_idx; + struct verifier_stack_elem *next; +}; + #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ /* single container for all structs @@ -132,6 +196,9 @@ */ struct verifier_env { struct bpf_prog *prog; /* eBPF program being verified */ + struct verifier_stack_elem *head; /* stack of verifier states to be processed */ + int stack_size; /* number of states to be processed */ + struct verifier_state cur_state; /* current verifier state */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ }; @@ -160,6 +227,45 @@ static void verbose(const char *fmt, ...) va_end(args); } +/* string representation of 'enum bpf_reg_type' */ +static const char * const reg_type_str[] = { + [NOT_INIT] = "?", + [UNKNOWN_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", + [FRAME_PTR] = "fp", + [PTR_TO_STACK] = "fp", + [CONST_IMM] = "imm", +}; + +static void print_verifier_state(struct verifier_env *env) +{ + enum bpf_reg_type t; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + t = env->cur_state.regs[i].type; + if (t == NOT_INIT) + continue; + verbose(" R%d=%s", i, reg_type_str[t]); + if (t == CONST_IMM || t == PTR_TO_STACK) + verbose("%d", env->cur_state.regs[i].imm); + else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || + t == PTR_TO_MAP_VALUE_OR_NULL) + verbose("(ks=%d,vs=%d)", + env->cur_state.regs[i].map_ptr->key_size, + env->cur_state.regs[i].map_ptr->value_size); + } + for (i = 0; i < MAX_BPF_STACK; i++) { + if (env->cur_state.stack[i].stype == STACK_SPILL) + verbose(" fp%d=%s", -MAX_BPF_STACK + i, + reg_type_str[env->cur_state.stack[i].reg_st.type]); + } + verbose("\n"); +} + static const char *const bpf_class_string[] = { [BPF_LD] = "ld", [BPF_LDX] = "ldx", @@ -305,6 +411,735 @@ static void print_bpf_insn(struct bpf_insn *insn) } } +static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +{ + struct verifier_stack_elem *elem; + int insn_idx; + + if (env->head == NULL) + return -1; + + memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); + insn_idx = env->head->insn_idx; + if (prev_insn_idx) + *prev_insn_idx = env->head->prev_insn_idx; + elem = env->head->next; + kfree(env->head); + env->head = elem; + env->stack_size--; + return insn_idx; +} + +static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, + int prev_insn_idx) +{ + struct verifier_stack_elem *elem; + + elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); + if (!elem) + goto err; + + memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); + elem->insn_idx = insn_idx; + elem->prev_insn_idx = prev_insn_idx; + elem->next = env->head; + env->head = elem; + env->stack_size++; + if (env->stack_size > 1024) { + verbose("BPF program is too complex\n"); + goto err; + } + return &elem->st; +err: + /* pop all elements and return */ + while (pop_stack(env, NULL) >= 0); + return NULL; +} + +#define CALLER_SAVED_REGS 6 +static const int caller_saved[CALLER_SAVED_REGS] = { + BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 +}; + +static void init_reg_state(struct reg_state *regs) +{ + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + regs[i].type = NOT_INIT; + regs[i].imm = 0; + regs[i].map_ptr = NULL; + } + + /* frame pointer */ + regs[BPF_REG_FP].type = FRAME_PTR; + + /* 1st arg to a function */ + regs[BPF_REG_1].type = PTR_TO_CTX; +} + +static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + regs[regno].type = UNKNOWN_VALUE; + regs[regno].imm = 0; + regs[regno].map_ptr = NULL; +} + +enum reg_arg_type { + SRC_OP, /* register is used as source operand */ + DST_OP, /* register is used as destination operand */ + DST_OP_NO_MARK /* same as above, check only, don't mark */ +}; + +static int check_reg_arg(struct reg_state *regs, u32 regno, + enum reg_arg_type t) +{ + if (regno >= MAX_BPF_REG) { + verbose("R%d is invalid\n", regno); + return -EINVAL; + } + + if (t == SRC_OP) { + /* check whether register used as source operand can be read */ + if (regs[regno].type == NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + } + } else { + /* check whether register used as dest operand can be written to */ + if (regno == BPF_REG_FP) { + verbose("frame pointer is read only\n"); + return -EACCES; + } + if (t == DST_OP) + mark_reg_unknown_value(regs, regno); + } + return 0; +} + +static int bpf_size_to_bytes(int bpf_size) +{ + if (bpf_size == BPF_W) + return 4; + else if (bpf_size == BPF_H) + return 2; + else if (bpf_size == BPF_B) + return 1; + else if (bpf_size == BPF_DW) + return 8; + else + return -EINVAL; +} + +/* check_stack_read/write functions track spill/fill of registers, + * stack boundary and alignment are checked in check_mem_access() + */ +static int check_stack_write(struct verifier_state *state, int off, int size, + int value_regno) +{ + struct bpf_stack_slot *slot; + int i; + + if (value_regno >= 0 && + (state->regs[value_regno].type == PTR_TO_MAP_VALUE || + state->regs[value_regno].type == PTR_TO_STACK || + state->regs[value_regno].type == PTR_TO_CTX)) { + + /* register containing pointer is being spilled into stack */ + if (size != 8) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + + slot = &state->stack[MAX_BPF_STACK + off]; + slot->stype = STACK_SPILL; + /* save register state */ + slot->reg_st = state->regs[value_regno]; + for (i = 1; i < 8; i++) { + slot = &state->stack[MAX_BPF_STACK + off + i]; + slot->stype = STACK_SPILL_PART; + slot->reg_st.type = UNKNOWN_VALUE; + slot->reg_st.map_ptr = NULL; + } + } else { + + /* regular write of data into stack */ + for (i = 0; i < size; i++) { + slot = &state->stack[MAX_BPF_STACK + off + i]; + slot->stype = STACK_MISC; + slot->reg_st.type = UNKNOWN_VALUE; + slot->reg_st.map_ptr = NULL; + } + } + return 0; +} + +static int check_stack_read(struct verifier_state *state, int off, int size, + int value_regno) +{ + int i; + struct bpf_stack_slot *slot; + + slot = &state->stack[MAX_BPF_STACK + off]; + + if (slot->stype == STACK_SPILL) { + if (size != 8) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + for (i = 1; i < 8; i++) { + if (state->stack[MAX_BPF_STACK + off + i].stype != + STACK_SPILL_PART) { + verbose("corrupted spill memory\n"); + return -EACCES; + } + } + + if (value_regno >= 0) + /* restore register state from stack */ + state->regs[value_regno] = slot->reg_st; + return 0; + } else { + for (i = 0; i < size; i++) { + if (state->stack[MAX_BPF_STACK + off + i].stype != + STACK_MISC) { + verbose("invalid read from stack off %d+%d size %d\n", + off, i, size); + return -EACCES; + } + } + if (value_regno >= 0) + /* have read misc data from the stack */ + mark_reg_unknown_value(state->regs, value_regno); + return 0; + } +} + +/* check read/write into map element returned by bpf_map_lookup_elem() */ +static int check_map_access(struct verifier_env *env, u32 regno, int off, + int size) +{ + struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + + if (off < 0 || off + size > map->value_size) { + verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + return 0; +} + +/* check access to 'struct bpf_context' fields */ +static int check_ctx_access(struct verifier_env *env, int off, int size, + enum bpf_access_type t) +{ + if (env->prog->aux->ops->is_valid_access && + env->prog->aux->ops->is_valid_access(off, size, t)) + return 0; + + verbose("invalid bpf_context access off=%d size=%d\n", off, size); + return -EACCES; +} + +/* check whether memory at (regno + off) is accessible for t = (read | write) + * if t==write, value_regno is a register which value is stored into memory + * if t==read, value_regno is a register which will receive the value from memory + * if t==write && value_regno==-1, some unknown value is stored into memory + * if t==read && value_regno==-1, don't care what we read from memory + */ +static int check_mem_access(struct verifier_env *env, u32 regno, int off, + int bpf_size, enum bpf_access_type t, + int value_regno) +{ + struct verifier_state *state = &env->cur_state; + int size, err = 0; + + size = bpf_size_to_bytes(bpf_size); + if (size < 0) + return size; + + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", off, size); + return -EACCES; + } + + if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + err = check_map_access(env, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == PTR_TO_CTX) { + err = check_ctx_access(env, off, size, t); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == FRAME_PTR) { + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose("invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + if (t == BPF_WRITE) + err = check_stack_write(state, off, size, value_regno); + else + err = check_stack_read(state, off, size, value_regno); + } else { + verbose("R%d invalid mem access '%s'\n", + regno, reg_type_str[state->regs[regno].type]); + return -EACCES; + } + return err; +} + +static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || + insn->imm != 0) { + verbose("BPF_XADD uses reserved fields\n"); + return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check whether atomic_add can read the memory */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, -1); + if (err) + return err; + + /* check whether atomic_add can write into the same memory */ + return check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1); +} + +/* when register 'regno' is passed into function that will read 'access_size' + * bytes from that pointer, make sure that it's within stack boundary + * and all elements of stack are initialized + */ +static int check_stack_boundary(struct verifier_env *env, + int regno, int access_size) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs; + int off, i; + + if (regs[regno].type != PTR_TO_STACK) + return -EACCES; + + off = regs[regno].imm; + if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || + access_size <= 0) { + verbose("invalid stack type R%d off=%d access_size=%d\n", + regno, off, access_size); + return -EACCES; + } + + for (i = 0; i < access_size; i++) { + if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) { + verbose("invalid indirect read from stack off %d+%d size %d\n", + off, i, access_size); + return -EACCES; + } + } + return 0; +} + +static int check_func_arg(struct verifier_env *env, u32 regno, + enum bpf_arg_type arg_type, struct bpf_map **mapp) +{ + struct reg_state *reg = env->cur_state.regs + regno; + enum bpf_reg_type expected_type; + int err = 0; + + if (arg_type == ARG_ANYTHING) + return 0; + + if (reg->type == NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + } + + if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + arg_type == ARG_PTR_TO_MAP_VALUE) { + expected_type = PTR_TO_STACK; + } else if (arg_type == ARG_CONST_STACK_SIZE) { + expected_type = CONST_IMM; + } else if (arg_type == ARG_CONST_MAP_PTR) { + expected_type = CONST_PTR_TO_MAP; + } else { + verbose("unsupported arg_type %d\n", arg_type); + return -EFAULT; + } + + if (reg->type != expected_type) { + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[reg->type], reg_type_str[expected_type]); + return -EACCES; + } + + if (arg_type == ARG_CONST_MAP_PTR) { + /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ + *mapp = reg->map_ptr; + + } else if (arg_type == ARG_PTR_TO_MAP_KEY) { + /* bpf_map_xxx(..., map_ptr, ..., key) call: + * check that [key, key + map->key_size) are within + * stack limits and initialized + */ + if (!*mapp) { + /* in function declaration map_ptr must come before + * map_key, so that it's verified and known before + * we have to check map_key here. Otherwise it means + * that kernel subsystem misconfigured verifier + */ + verbose("invalid map_ptr to access map->key\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->key_size); + + } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { + /* bpf_map_xxx(..., map_ptr, ..., value) call: + * check [value, value + map->value_size) validity + */ + if (!*mapp) { + /* kernel subsystem misconfigured verifier */ + verbose("invalid map_ptr to access map->value\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->value_size); + + } else if (arg_type == ARG_CONST_STACK_SIZE) { + /* bpf_xxx(..., buf, len) call will access 'len' bytes + * from stack pointer 'buf'. Check it + * note: regno == len, regno - 1 == buf + */ + if (regno == 0) { + /* kernel subsystem misconfigured verifier */ + verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno - 1, reg->imm); + } + + return err; +} + +static int check_call(struct verifier_env *env, int func_id) +{ + struct verifier_state *state = &env->cur_state; + const struct bpf_func_proto *fn = NULL; + struct reg_state *regs = state->regs; + struct bpf_map *map = NULL; + struct reg_state *reg; + int i, err; + + /* find function prototype */ + if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { + verbose("invalid func %d\n", func_id); + return -EINVAL; + } + + if (env->prog->aux->ops->get_func_proto) + fn = env->prog->aux->ops->get_func_proto(func_id); + + if (!fn) { + verbose("unknown func %d\n", func_id); + return -EINVAL; + } + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { + verbose("cannot call GPL only function from proprietary program\n"); + return -EINVAL; + } + + /* check args */ + err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); + if (err) + return err; + + /* reset caller saved regs */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* update return register */ + if (fn->ret_type == RET_INTEGER) { + regs[BPF_REG_0].type = UNKNOWN_VALUE; + } else if (fn->ret_type == RET_VOID) { + regs[BPF_REG_0].type = NOT_INIT; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + /* remember map_ptr, so that check_map_access() + * can check 'value_size' boundary of memory access + * to map element returned from bpf_map_lookup_elem() + */ + if (map == NULL) { + verbose("kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + regs[BPF_REG_0].map_ptr = map; + } else { + verbose("unknown return type %d of func %d\n", + fn->ret_type, func_id); + return -EINVAL; + } + return 0; +} + +/* check validity of 32-bit and 64-bit arithmetic operations */ +static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) +{ + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode == BPF_END || opcode == BPF_NEG) { + if (opcode == BPF_NEG) { + if (BPF_SRC(insn->code) != 0 || + insn->src_reg != BPF_REG_0 || + insn->off != 0 || insn->imm != 0) { + verbose("BPF_NEG uses reserved fields\n"); + return -EINVAL; + } + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0 || + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + verbose("BPF_END uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + } else if (opcode == BPF_MOV) { + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (BPF_SRC(insn->code) == BPF_X) { + if (BPF_CLASS(insn->code) == BPF_ALU64) { + /* case: R1 = R2 + * copy register state to dest reg + */ + regs[insn->dst_reg] = regs[insn->src_reg]; + } else { + regs[insn->dst_reg].type = UNKNOWN_VALUE; + regs[insn->dst_reg].map_ptr = NULL; + } + } else { + /* case: R = imm + * remember the value we stored into this reg + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + + } else if (opcode > BPF_END) { + verbose("invalid BPF_ALU opcode %x\n", opcode); + return -EINVAL; + + } else { /* all other ALU ops: and, sub, xor, add, ... */ + + bool stack_relative = false; + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + if ((opcode == BPF_MOD || opcode == BPF_DIV) && + BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { + verbose("div by zero\n"); + return -EINVAL; + } + + /* pattern match 'bpf_add Rx, imm' instruction */ + if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && + regs[insn->dst_reg].type == FRAME_PTR && + BPF_SRC(insn->code) == BPF_K) + stack_relative = true; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (stack_relative) { + regs[insn->dst_reg].type = PTR_TO_STACK; + regs[insn->dst_reg].imm = insn->imm; + } + } + + return 0; +} + +static int check_cond_jmp_op(struct verifier_env *env, + struct bpf_insn *insn, int *insn_idx) +{ + struct reg_state *regs = env->cur_state.regs; + struct verifier_state *other_branch; + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode > BPF_EXIT) { + verbose("invalid BPF_JMP opcode %x\n", opcode); + return -EINVAL; + } + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0) { + verbose("BPF_JMP uses reserved fields\n"); + return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0) { + verbose("BPF_JMP uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* detect if R == 0 where R was initialized to zero earlier */ + if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE) && + regs[insn->dst_reg].type == CONST_IMM && + regs[insn->dst_reg].imm == insn->imm) { + if (opcode == BPF_JEQ) { + /* if (imm == imm) goto pc+off; + * only follow the goto, ignore fall-through + */ + *insn_idx += insn->off; + return 0; + } else { + /* if (imm != imm) goto pc+off; + * only follow fall-through branch, since + * that's where the program will go + */ + return 0; + } + } + + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + if (!other_branch) + return -EFAULT; + + /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ + if (BPF_SRC(insn->code) == BPF_K && + insn->imm == 0 && (opcode == BPF_JEQ || + opcode == BPF_JNE) && + regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { + if (opcode == BPF_JEQ) { + /* next fallthrough insn can access memory via + * this register + */ + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + /* branch targer cannot access it, since reg == 0 */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + other_branch->regs[insn->dst_reg].imm = 0; + } else { + other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = 0; + } + } else if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE)) { + + if (opcode == BPF_JEQ) { + /* detect if (R == imm) goto + * and in the target state recognize that R = imm + */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + other_branch->regs[insn->dst_reg].imm = insn->imm; + } else { + /* detect if (R != imm) goto + * and in the fall-through state recognize that R = imm + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + } + if (log_level) + print_verifier_state(env); + return 0; +} + /* return the map pointer stored inside BPF_LD_IMM64 instruction */ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) { @@ -313,6 +1148,37 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) return (struct bpf_map *) (unsigned long) imm64; } +/* verify BPF_LD_IMM64 instruction */ +static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if (BPF_SIZE(insn->code) != BPF_DW) { + verbose("invalid BPF_LD_IMM insn\n"); + return -EINVAL; + } + if (insn->off != 0) { + verbose("BPF_LD_IMM64 uses reserved fields\n"); + return -EINVAL; + } + + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (insn->src_reg == 0) + /* generic move 64-bit immediate into a register */ + return 0; + + /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ + BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); + return 0; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -498,6 +1364,212 @@ err_free: return ret; } +static int do_check(struct verifier_env *env) +{ + struct verifier_state *state = &env->cur_state; + struct bpf_insn *insns = env->prog->insnsi; + struct reg_state *regs = state->regs; + int insn_cnt = env->prog->len; + int insn_idx, prev_insn_idx = 0; + int insn_processed = 0; + bool do_print_state = false; + + init_reg_state(regs); + insn_idx = 0; + for (;;) { + struct bpf_insn *insn; + u8 class; + int err; + + if (insn_idx >= insn_cnt) { + verbose("invalid insn idx %d insn_cnt %d\n", + insn_idx, insn_cnt); + return -EFAULT; + } + + insn = &insns[insn_idx]; + class = BPF_CLASS(insn->code); + + if (++insn_processed > 32768) { + verbose("BPF program is too large. Proccessed %d insn\n", + insn_processed); + return -E2BIG; + } + + if (log_level && do_print_state) { + verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); + print_verifier_state(env); + do_print_state = false; + } + + if (log_level) { + verbose("%d: ", insn_idx); + print_bpf_insn(insn); + } + + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(regs, insn); + if (err) + return err; + + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + /* check that memory (src_reg + off) is readable, + * the state of dst_reg will be updated by this func + */ + err = check_mem_access(env, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, + insn->dst_reg); + if (err) + return err; + + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_XADD) { + err = check_xadd(env, insn); + if (err) + return err; + insn_idx++; + continue; + } + + if (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0) { + verbose("BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + insn->src_reg); + if (err) + return err; + + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM || + insn->src_reg != BPF_REG_0) { + verbose("BPF_ST uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + -1); + if (err) + return err; + + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + if (BPF_SRC(insn->code) != BPF_K || + insn->off != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_CALL uses reserved fields\n"); + return -EINVAL; + } + + err = check_call(env, insn->imm); + if (err) + return err; + + } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_JA uses reserved fields\n"); + return -EINVAL; + } + + insn_idx += insn->off + 1; + continue; + + } else if (opcode == BPF_EXIT) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_EXIT uses reserved fields\n"); + return -EINVAL; + } + + /* eBPF calling convetion is such that R0 is used + * to return the value from eBPF program. + * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(regs, BPF_REG_0, SRC_OP); + if (err) + return err; + + insn_idx = pop_stack(env, &prev_insn_idx); + if (insn_idx < 0) { + break; + } else { + do_print_state = true; + continue; + } + } else { + err = check_cond_jmp_op(env, insn, &insn_idx); + if (err) + return err; + } + } else if (class == BPF_LD) { + u8 mode = BPF_MODE(insn->code); + + if (mode == BPF_ABS || mode == BPF_IND) { + verbose("LD_ABS is not supported yet\n"); + return -EINVAL; + } else if (mode == BPF_IMM) { + err = check_ld_imm(env, insn); + if (err) + return err; + + insn_idx++; + } else { + verbose("invalid BPF_LD mode\n"); + return -EINVAL; + } + } else { + verbose("unknown insn class %d\n", class); + return -EINVAL; + } + + insn_idx++; + } + + return 0; +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -651,9 +1723,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; - /* ret = do_check(env); */ + ret = do_check(env); skip_full_check: + while (pop_stack(env, NULL) >= 0); if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); -- cgit v1.2.3 From 6ea754eb761d9e7a8ac6fa462b05f9e4cf04fb6c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 22 Sep 2014 11:10:50 -0700 Subject: net: Change netdev_ logging functions to return void No caller or macro uses the return value so make all the functions return void. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/linux/netdevice.h | 19 +++++++++---------- net/core/dev.c | 44 +++++++++++++++++--------------------------- 2 files changed, 26 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9f5d293a0281..9b7fbacb6296 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3612,22 +3612,22 @@ static inline const char *netdev_reg_state(const struct net_device *dev) } __printf(3, 4) -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...); +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...); __printf(2, 3) -int netdev_emerg(const struct net_device *dev, const char *format, ...); +void netdev_emerg(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_alert(const struct net_device *dev, const char *format, ...); +void netdev_alert(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_crit(const struct net_device *dev, const char *format, ...); +void netdev_crit(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_err(const struct net_device *dev, const char *format, ...); +void netdev_err(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_warn(const struct net_device *dev, const char *format, ...); +void netdev_warn(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_notice(const struct net_device *dev, const char *format, ...); +void netdev_notice(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_info(const struct net_device *dev, const char *format, ...); +void netdev_info(const struct net_device *dev, const char *format, ...); #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) @@ -3645,7 +3645,6 @@ do { \ ({ \ if (0) \ netdev_printk(KERN_DEBUG, __dev, format, ##args); \ - 0; \ }) #endif diff --git a/net/core/dev.c b/net/core/dev.c index e2ced01c01e4..e55c546717d4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7066,53 +7066,45 @@ const char *netdev_drivername(const struct net_device *dev) return empty; } -static int __netdev_printk(const char *level, const struct net_device *dev, - struct va_format *vaf) +static void __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) { - int r; - if (dev && dev->dev.parent) { - r = dev_printk_emit(level[1] - '0', - dev->dev.parent, - "%s %s %s%s: %pV", - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), netdev_reg_state(dev), - vaf); + dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s%s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s%s: %pV", level, netdev_name(dev), - netdev_reg_state(dev), vaf); + printk("%s%s%s: %pV", + level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { - r = printk("%s(NULL net_device): %pV", level, vaf); + printk("%s(NULL net_device): %pV", level, vaf); } - - return r; } -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...) +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = __netdev_printk(level, dev, &vaf); + __netdev_printk(level, dev, &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ -int func(const struct net_device *dev, const char *fmt, ...) \ +void func(const struct net_device *dev, const char *fmt, ...) \ { \ - int r; \ struct va_format vaf; \ va_list args; \ \ @@ -7121,11 +7113,9 @@ int func(const struct net_device *dev, const char *fmt, ...) \ vaf.fmt = fmt; \ vaf.va = &args; \ \ - r = __netdev_printk(level, dev, &vaf); \ + __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ - \ - return r; \ } \ EXPORT_SYMBOL(func); -- cgit v1.2.3 From f4a775d14489a801a5b8b0540e23ab82e2703091 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Sep 2014 16:29:32 -0700 Subject: net: introduce __skb_header_release() While profiling TCP stack, I noticed one useless atomic operation in tcp_sendmsg(), caused by skb_header_release(). It turns out all current skb_header_release() users have a fresh skb, that no other user can see, so we can avoid one atomic operation. Introduce __skb_header_release() to clearly document this. This gave me a 1.5 % improvement on TCP_RR workload. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 +++++++++++++++ net/core/skbuff.c | 4 ++-- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_output.c | 10 +++++----- 4 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f1bfa3781c75..8eaa62400fca 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1083,6 +1083,7 @@ static inline int skb_header_cloned(const struct sk_buff *skb) * Drop a reference to the header part of the buffer. This is done * by acquiring a payload reference. You must not read from the header * part of skb->data after this. + * Note : Check if you can use __skb_header_release() instead. */ static inline void skb_header_release(struct sk_buff *skb) { @@ -1091,6 +1092,20 @@ static inline void skb_header_release(struct sk_buff *skb) atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref); } +/** + * __skb_header_release - release reference to header + * @skb: buffer to operate on + * + * Variant of skb_header_release() assuming skb is private to caller. + * We can avoid one atomic operation. + */ +static inline void __skb_header_release(struct sk_buff *skb) +{ + skb->nohdr = 1; + atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT)); +} + + /** * skb_shared - is the buffer shared * @skb: buffer to check diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 06a8feb10099..512dc7dcbc32 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3179,7 +3179,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) skb_shinfo(nskb)->frag_list = p; skb_shinfo(nskb)->gso_size = pinfo->gso_size; pinfo->gso_size = 0; - skb_header_release(p); + __skb_header_release(p); NAPI_GRO_CB(nskb)->last = p; nskb->data_len += p->len; @@ -3211,7 +3211,7 @@ merge: else NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; - skb_header_release(skb); + __skb_header_release(skb); lp = p; done: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 070aeff1b131..553b01f52f71 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -609,7 +609,7 @@ static inline bool forced_push(const struct tcp_sock *tp) return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } -static inline void skb_entail(struct sock *sk, struct sk_buff *skb) +static void skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -618,7 +618,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; tcb->sacked = 0; - skb_header_release(skb); + __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8c61a7c0c889..f173b1c4f815 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -995,7 +995,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. */ tp->write_seq = TCP_SKB_CB(skb)->end_seq; - skb_header_release(skb); + __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); @@ -1167,7 +1167,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, } /* Link BUFF into the send queue. */ - skb_header_release(buff); + __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); return 0; @@ -1671,7 +1671,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, tcp_set_skb_tso_segs(sk, buff, mss_now); /* Link BUFF into the send queue. */ - skb_header_release(buff); + __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); return 0; @@ -2772,7 +2772,7 @@ int tcp_send_synack(struct sock *sk) if (nskb == NULL) return -ENOMEM; tcp_unlink_write_queue(skb, sk); - skb_header_release(nskb); + __skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); sk_wmem_free_skb(sk, skb); sk->sk_wmem_queued += nskb->truesize; @@ -2947,7 +2947,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->end_seq += skb->len; - skb_header_release(skb); + __skb_header_release(skb); __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); -- cgit v1.2.3 From 4565af0d406bed44bc8756230eae8f7caa5e0334 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Tue, 23 Sep 2014 10:54:32 +0200 Subject: net: optimise csum_replace4() csum_partial() is a generic function which is not optimised for small fixed length calculations, and its use requires to store "from" and "to" values in memory while we already have them available in registers. This also has impact, especially on RISC processors. In the same spirit as the change done by Eric Dumazet on csum_replace2(), this patch rewrites inet_proto_csum_replace4() taking into account RFC1624. I spotted during a NATted tcp transfert that csum_partial() is one of top 5 consuming functions (around 8%), and the second user of csum_partial() is inet_proto_csum_replace4(). I have proposed the same modification to inet_proto_csum_replace4() in another patch. Signed-off-by: Christophe Leroy Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/checksum.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 87cb1903640d..6465bae80a4f 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -122,9 +122,7 @@ static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { - __be32 diff[] = { ~from, to }; - - *sum = csum_fold(csum_partial(diff, sizeof(diff), ~csum_unfold(*sum))); + *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from), to)); } /* Implements RFC 1624 (Incremental Internet Checksum) -- cgit v1.2.3 From 24a2d43d8886f5a29c3cf108927f630c545a9a38 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Sep 2014 09:50:55 -0700 Subject: ipv4: rename ip_options_echo to __ip_options_echo() ip_options_echo() assumes struct ip_options is provided in &IPCB(skb)->opt Lets break this assumption, but provide a helper to not change all call points. ip_send_unicast_reply() gets a new struct ip_options pointer. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 15 ++++++++++++--- net/ipv4/ip_options.c | 6 ++---- net/ipv4/ip_output.c | 8 +++++--- net/ipv4/tcp_ipv4.c | 10 ++++++---- 4 files changed, 25 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index fcd9068fb8c3..0bb620702929 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -180,8 +180,10 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, - __be32 saddr, const struct ip_reply_arg *arg, +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, + const struct ip_options *sopt, + __be32 daddr, __be32 saddr, + const struct ip_reply_arg *arg, unsigned int len); #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) @@ -511,7 +513,14 @@ int ip_forward(struct sk_buff *skb); void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt, int is_frag); -int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb); + +int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, + const struct ip_options *sopt); +static inline int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) +{ + return __ip_options_echo(dopt, skb, &IPCB(skb)->opt); +} + void ip_options_fragment(struct sk_buff *skb); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ad382499bace..5b3d91be2db0 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -87,17 +87,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, * NOTE: dopt cannot point to skb. */ -int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) +int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, + const struct ip_options *sopt) { - const struct ip_options *sopt; unsigned char *sptr, *dptr; int soffset, doffset; int optlen; memset(dopt, 0, sizeof(struct ip_options)); - sopt = &(IPCB(skb)->opt); - if (sopt->optlen == 0) return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 215af2b155cb..c8fa62476461 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1522,8 +1522,10 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { .uc_ttl = -1, }; -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, - __be32 saddr, const struct ip_reply_arg *arg, +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, + const struct ip_options *sopt, + __be32 daddr, __be32 saddr, + const struct ip_reply_arg *arg, unsigned int len) { struct ip_options_data replyopts; @@ -1534,7 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, struct sock *sk; struct inet_sock *inet; - if (ip_options_echo(&replyopts.opt.opt, skb)) + if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) return; ipc.addr = daddr; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3b2e49cb2b61..28ab90382c01 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -681,8 +681,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; - ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); @@ -764,8 +765,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } -- cgit v1.2.3 From a224772db8420ecb7ce91a9ba5d535ee3a50d982 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Sep 2014 09:50:56 -0700 Subject: ipv6: add a struct inet6_skb_parm param to ipv6_opt_accepted() ipv6_opt_accepted() assumes IP6CB(skb) holds the struct inet6_skb_parm that it needs. Lets not assume this, as TCP stack might use a different place. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 3 ++- net/dccp/ipv6.c | 2 +- net/ipv6/af_inet6.c | 4 ++-- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 5 +++-- 5 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7e247e9b8765..97f472012438 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -288,7 +288,8 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); -bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb); +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, + const struct inet6_skb_parm *opt); static inline bool ipv6_accept_ra(struct inet6_dev *idev) { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 04cb17d4b0ce..ad2acfe1ca61 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -404,7 +404,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e4865a3ebe1d..34f726f59814 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -672,10 +672,10 @@ int inet6_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); -bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb) +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, + const struct inet6_skb_parm *opt) { const struct ipv6_pinfo *np = inet6_sk(sk); - const struct inet6_skb_parm *opt = IP6CB(skb); if (np->rxopt.all) { if ((opt->hop && (np->rxopt.bits.hopopts || diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index c643dc907ce7..9a2838e93cc5 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -203,7 +203,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->ir_num = ntohs(th->dest); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index de51a88bec6f..9400b4326f22 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -742,7 +742,8 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, ireq->ir_iif = inet6_iif(skb); if (!TCP_SKB_CB(skb)->tcp_tw_isn && - (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || + (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { atomic_inc(&skb->users); @@ -1367,7 +1368,7 @@ ipv6_pktoptions: np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (np->repflow) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); - if (ipv6_opt_accepted(sk, opt_skb)) { + if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); } else { -- cgit v1.2.3 From 971f10eca186cab238c49daa91f703c5a001b0b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Sep 2014 09:50:57 -0700 Subject: tcp: better TCP_SKB_CB layout to reduce cache line misses TCP maintains lists of skb in write queue, and in receive queues (in order and out of order queues) Scanning these lists both in input and output path usually requires access to skb->next, TCP_SKB_CB(skb)->seq, and TCP_SKB_CB(skb)->end_seq These fields are currently in two different cache lines, meaning we waste lot of memory bandwidth when these queues are big and flows have either packet drops or packet reorders. We can move TCP_SKB_CB(skb)->header at the end of TCP_SKB_CB, because this header is not used in fast path. This allows TCP to search much faster in the skb lists. Even with regular flows, we save one cache line miss in fast path. Thanks to Christoph Paasch for noticing we need to cleanup skb->cb[] (IPCB/IP6CB) before entering IP stack in tx path, and that I forgot IPCB use in tcp_v4_hnd_req() and tcp_v4_save_options(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 12 ++++++------ net/ipv4/tcp_ipv4.c | 19 ++++++++++++------- net/ipv4/tcp_output.c | 5 +++++ net/ipv6/tcp_ipv6.c | 7 +++++++ 4 files changed, 30 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a4201ef216e8..4dc6641ee990 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -696,12 +696,6 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. */ struct tcp_skb_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ @@ -720,6 +714,12 @@ struct tcp_skb_cb { __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ /* 1 byte hole */ __u32 ack_seq; /* Sequence number ACK'd */ + union { + struct inet_skb_parm h4; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_skb_parm h6; +#endif + } header; /* For incoming frames */ }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 28ab90382c01..9ce3eac02957 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -886,18 +886,16 @@ EXPORT_SYMBOL(tcp_syn_flood_action); */ static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) { - const struct ip_options *opt = &(IPCB(skb)->opt); + const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt && opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); - if (dopt) { - if (ip_options_echo(&dopt->opt, skb)) { - kfree(dopt); - dopt = NULL; - } + if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { + kfree(dopt); + dopt = NULL; } } return dopt; @@ -1431,7 +1429,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_SYN_COOKIES if (!th->syn) - sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); + sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt); #endif return sk; } @@ -1636,6 +1634,13 @@ int tcp_v4_rcv(struct sk_buff *skb) th = tcp_hdr(skb); iph = ip_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), + sizeof(struct inet_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f173b1c4f815..a462fb1db896 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -974,6 +974,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; + + /* Cleanup our debris for IP stacks */ + memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9400b4326f22..132bac137aed 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1412,6 +1412,13 @@ static int tcp_v6_rcv(struct sk_buff *skb) th = tcp_hdr(skb); hdr = ipv6_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), + sizeof(struct inet6_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); -- cgit v1.2.3 From cd7d8498c9a5d510c64db38d9f4f4fbc41790f09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Sep 2014 04:11:22 -0700 Subject: tcp: change tcp_skb_pcount() location Our goal is to access no more than one cache line access per skb in a write or receive queue when doing the various walks. After recent TCP_SKB_CB() reorganizations, it is almost done. Last part is tcp_skb_pcount() which currently uses skb_shinfo(skb)->gso_segs, which is a terrible choice, because it needs 3 cache lines in current kernel (skb->head, skb->end, and shinfo->gso_segs are all in 3 different cache lines, far from skb->cb) This very simple patch reuses space currently taken by tcp_tw_isn only in input path, as tcp_skb_pcount is only needed for skb stored in write queue. This considerably speeds up tcp_ack(), granted we avoid shinfo->tx_flags to get SKBTX_ACK_TSTAMP, which seems possible. This also speeds up all sack processing in general. This speeds up tcp_sendmsg() because it no longer has to access/dirty shinfo. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 23 +++++++++++++++++++++-- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_output.c | 9 ++++++--- 4 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 4dc6641ee990..02a9a2c366bf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -698,7 +698,16 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ + union { + /* Note : tcp_tw_isn is used in input path only + * (isn chosen by tcp_timewait_state_process()) + * + * tcp_gso_segs is used in write queue only, + * cf tcp_skb_pcount() + */ + __u32 tcp_tw_isn; + __u32 tcp_gso_segs; + }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. */ @@ -746,7 +755,17 @@ TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { - return skb_shinfo(skb)->gso_segs; + return TCP_SKB_CB(skb)->tcp_gso_segs; +} + +static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs) +{ + TCP_SKB_CB(skb)->tcp_gso_segs = segs; +} + +static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs) +{ + TCP_SKB_CB(skb)->tcp_gso_segs += segs; } /* This is valid iff tcp_skb_pcount() > 1. */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 553b01f52f71..87289e51be00 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -963,7 +963,7 @@ new_segment: skb->ip_summed = CHECKSUM_PARTIAL; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + tcp_skb_pcount_set(skb, 0); if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; @@ -1261,7 +1261,7 @@ new_segment: tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + tcp_skb_pcount_set(skb, 0); from += copy; copied += copy; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f3f016a15c5a..2c0af90231cf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1295,9 +1295,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; - skb_shinfo(prev)->gso_segs += pcount; - BUG_ON(skb_shinfo(skb)->gso_segs < pcount); - skb_shinfo(skb)->gso_segs -= pcount; + tcp_skb_pcount_add(prev, pcount); + BUG_ON(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK @@ -1310,7 +1310,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, } /* CHECKME: To clear or not to clear? Mimics normal skb currently */ - if (skb_shinfo(skb)->gso_segs <= 1) { + if (tcp_skb_pcount(skb) <= 1) { skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_type = 0; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a462fb1db896..4d92703df4c6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -384,7 +384,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; - shinfo->gso_segs = 1; + tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; shinfo->gso_type = 0; @@ -972,6 +972,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + /* OK, its time to fill skb_shinfo(skb)->gso_segs */ + skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); + /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; @@ -1019,11 +1022,11 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, /* Avoid the costly divide in the normal * non-TSO case. */ - shinfo->gso_segs = 1; + tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; shinfo->gso_type = 0; } else { - shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); + tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); shinfo->gso_size = mss_now; shinfo->gso_type = sk->sk_gso_type; } -- cgit v1.2.3 From b2f2af21e37f6d12bd735c27da8942331aa9b3d7 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 24 Sep 2014 17:05:18 -0700 Subject: net: dsa: allow enabling and disable switch ports Whenever a per-port network device is used/unused, invoke the switch driver port_enable/port_disable callbacks to allow saving as much power as possible by disabling unused parts of the switch (RX/TX logic, memory arrays, PHYs...). We supply a PHY device argument to make sure the switch driver can act on the PHY device if needed (like putting/taking the PHY out of deep low power mode). Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 8 ++++++++ net/dsa/slave.c | 14 ++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index d8054fb4a4df..4f664fe0e42c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -224,6 +224,14 @@ struct dsa_switch_driver { */ int (*suspend)(struct dsa_switch *ds); int (*resume)(struct dsa_switch *ds); + + /* + * Port enable/disable + */ + int (*port_enable)(struct dsa_switch *ds, int port, + struct phy_device *phy); + void (*port_disable)(struct dsa_switch *ds, int port, + struct phy_device *phy); }; void register_switch_driver(struct dsa_switch_driver *type); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4392e983abda..182d30ae6818 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -62,6 +62,7 @@ static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = p->parent->dst->master_netdev; + struct dsa_switch *ds = p->parent; int err; if (!(master->flags & IFF_UP)) @@ -84,11 +85,20 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } + if (ds->drv->port_enable) { + err = ds->drv->port_enable(ds, p->port, p->phy); + if (err) + goto clear_promisc; + } + if (p->phy) phy_start(p->phy); return 0; +clear_promisc: + if (dev->flags & IFF_PROMISC) + dev_set_promiscuity(master, 0); clear_allmulti: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(master, -1); @@ -103,6 +113,7 @@ static int dsa_slave_close(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = p->parent->dst->master_netdev; + struct dsa_switch *ds = p->parent; if (p->phy) phy_stop(p->phy); @@ -117,6 +128,9 @@ static int dsa_slave_close(struct net_device *dev) if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); + if (ds->drv->port_disable) + ds->drv->port_disable(ds, p->port, p->phy); + return 0; } -- cgit v1.2.3 From 7905288f093ad924004609bb89a7ce1597892726 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 24 Sep 2014 17:05:21 -0700 Subject: net: dsa: allow switches driver to implement get/set EEE Allow switches driver to query and enable/disable EEE on a per-port basis by implementing the ethtool_{get,set}_eee settings and delegating these operations to the switch driver. set_eee() will need to coordinate with the PHY driver to make sure that EEE is enabled, the link-partner supports it and the auto-negotiation result is satisfactory. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 9 +++++++++ net/dsa/slave.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4f664fe0e42c..58ad8c6492db 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -232,6 +232,15 @@ struct dsa_switch_driver { struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port, struct phy_device *phy); + + /* + * EEE setttings + */ + int (*set_eee)(struct dsa_switch *ds, int port, + struct phy_device *phydev, + struct ethtool_eee *e); + int (*get_eee)(struct dsa_switch *ds, int port, + struct ethtool_eee *e); }; void register_switch_driver(struct dsa_switch_driver *type); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 182d30ae6818..36953c84ff2d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -342,6 +342,44 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) return ret; } +static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret; + + if (!ds->drv->set_eee) + return -EOPNOTSUPP; + + ret = ds->drv->set_eee(ds, p->port, p->phy, e); + if (ret) + return ret; + + if (p->phy) + ret = phy_ethtool_set_eee(p->phy, e); + + return ret; +} + +static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret; + + if (!ds->drv->get_eee) + return -EOPNOTSUPP; + + ret = ds->drv->get_eee(ds, p->port, e); + if (ret) + return ret; + + if (p->phy) + ret = phy_ethtool_get_eee(p->phy, e); + + return ret; +} + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_settings = dsa_slave_get_settings, .set_settings = dsa_slave_set_settings, @@ -353,6 +391,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_sset_count = dsa_slave_get_sset_count, .set_wol = dsa_slave_set_wol, .get_wol = dsa_slave_get_wol, + .set_eee = dsa_slave_set_eee, + .get_eee = dsa_slave_get_eee, }; static const struct net_device_ops dsa_slave_netdev_ops = { -- cgit v1.2.3 From 18d0264f630e200772bf236ac5747c47e908501e Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 25 Sep 2014 10:26:37 -0700 Subject: net_sched: remove the first parameter from tcf_exts_destroy() Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 +- net/sched/cls_api.c | 2 +- net/sched/cls_basic.c | 4 ++-- net/sched/cls_bpf.c | 4 ++-- net/sched/cls_cgroup.c | 6 +++--- net/sched/cls_flow.c | 4 ++-- net/sched/cls_fw.c | 4 ++-- net/sched/cls_route.c | 4 ++-- net/sched/cls_rsvp.h | 4 ++-- net/sched/cls_tcindex.c | 4 ++-- net/sched/cls_u32.c | 4 ++-- 11 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6da46dcf1049..73f9532ee581 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -137,7 +137,7 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr); -void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts); +void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e547efdaba9d..77147c8c4acc 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -496,7 +496,7 @@ out: return skb->len; } -void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) +void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 1937298d6775..fe20826a79e7 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -94,7 +94,7 @@ static void basic_delete_filter(struct rcu_head *head) struct tcf_proto *tp = f->tp; tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); + tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(tp, &f->ematches); kfree(f); } @@ -161,7 +161,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, return 0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 4e3f5bfc0b26..4318d067b0a0 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -93,7 +93,7 @@ static int cls_bpf_init(struct tcf_proto *tp) static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) { tcf_unbind_filter(tp, &prog->res); - tcf_exts_destroy(tp, &prog->exts); + tcf_exts_destroy(&prog->exts); bpf_prog_destroy(prog->filter); @@ -217,7 +217,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, errout_free: kfree(bpf_ops); errout: - tcf_exts_destroy(tp, &exts); + tcf_exts_destroy(&exts); return ret; } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 15c34d4ccd9e..3409f168225f 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -86,7 +86,7 @@ static void cls_cgroup_destroy_rcu(struct rcu_head *root) struct cls_cgroup_head, rcu); - tcf_exts_destroy(head->tp, &head->exts); + tcf_exts_destroy(&head->exts); tcf_em_tree_destroy(head->tp, &head->ematches); kfree(head); } @@ -135,7 +135,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t); if (err < 0) { - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); goto errout; } @@ -156,7 +156,7 @@ static void cls_cgroup_destroy(struct tcf_proto *tp) struct cls_cgroup_head *head = rtnl_dereference(tp->root); if (head) { - tcf_exts_destroy(tp, &head->exts); + tcf_exts_destroy(&head->exts); tcf_em_tree_destroy(tp, &head->ematches); RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 95736fa479f3..f18d27f7b5f2 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -354,7 +354,7 @@ static void flow_destroy_filter(struct rcu_head *head) struct flow_filter *f = container_of(head, struct flow_filter, rcu); del_timer_sync(&f->perturb_timer); - tcf_exts_destroy(f->tp, &f->exts); + tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(f->tp, &f->ematches); kfree(f); } @@ -533,7 +533,7 @@ err2: tcf_em_tree_destroy(tp, &t); kfree(fnew); err1: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 2650285620ee..da805aeeb65c 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -126,7 +126,7 @@ static void fw_delete_filter(struct rcu_head *head) struct tcf_proto *tp = f->tp; tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); + tcf_exts_destroy(&f->exts); kfree(f); } @@ -223,7 +223,7 @@ fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, return 0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index ba96deacf27c..b665aee661f7 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -272,7 +272,7 @@ route4_delete_filter(struct rcu_head *head) struct tcf_proto *tp = f->tp; tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); + tcf_exts_destroy(&f->exts); kfree(f); } @@ -456,7 +456,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, return 0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index b044c208b133..1c64a09753c4 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -264,7 +264,7 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) { tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); + tcf_exts_destroy(&f->exts); kfree_rcu(f, rcu); } @@ -577,7 +577,7 @@ insert: errout: kfree(f); errout2: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 5054fae33a48..e3c6fa3ea3d2 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -169,7 +169,7 @@ found: rcu_assign_pointer(*walk, rtnl_dereference(f->next)); } tcf_unbind_filter(tp, &r->res); - tcf_exts_destroy(tp, &r->exts); + tcf_exts_destroy(&r->exts); if (f) kfree_rcu(f, rcu); return 0; @@ -401,7 +401,7 @@ errout_alloc: kfree(cp->h); errout: kfree(cp); - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ef97a646ee94..4be3ebf46d67 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -359,7 +359,7 @@ static int u32_destroy_key(struct tcf_proto *tp, bool free_pf) { tcf_unbind_filter(tp, &n->res); - tcf_exts_destroy(tp, &n->exts); + tcf_exts_destroy(&n->exts); if (n->ht_down) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF @@ -606,7 +606,7 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, return 0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } -- cgit v1.2.3 From 3d9a0d2f8212879407e58d67f460d8920eb6543d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Sep 2014 23:04:56 -0700 Subject: dql: dql_queued() should write first to reduce bus transactions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While doing high throughput test on a BQL enabled NIC, I found a very high cost in ndo_start_xmit() when accessing BQL data. It turned out the problem was caused by compiler trying to be smart, but involving a bad MESI transaction : 0.05 │ mov 0xc0(%rax),%edi // LOAD dql->num_queued 0.48 │ mov %edx,0xc8(%rax) // STORE dql->last_obj_cnt = count 58.23 │ add %edx,%edi 0.58 │ cmp %edi,0xc4(%rax) 0.76 │ mov %edi,0xc0(%rax) // STORE dql->num_queued += count 0.72 │ js bd8 I got an incredible 10 % gain [1] by making sure cpu do not attempt to get the cache line in Shared mode, but directly requests for ownership. New code : mov %edx,0xc8(%rax) // STORE dql->last_obj_cnt = count add %edx,0xc0(%rax) // RMW dql->num_queued += count mov 0xc4(%rax),%ecx // LOAD dql->adj_limit mov 0xc0(%rax),%edx // LOAD dql->num_queued cmp %edx,%ecx The TX completion was running from another cpu, with high interrupts rate. Note that I am using barrier() as a soft hint, as mb() here could be too heavy cost. [1] This was a netperf TCP_STREAM with TSO disabled, but GSO enabled. Signed-off-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/dynamic_queue_limits.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h index 5621547d631b..a4be70398ce1 100644 --- a/include/linux/dynamic_queue_limits.h +++ b/include/linux/dynamic_queue_limits.h @@ -73,14 +73,22 @@ static inline void dql_queued(struct dql *dql, unsigned int count) { BUG_ON(count > DQL_MAX_OBJECT); - dql->num_queued += count; dql->last_obj_cnt = count; + + /* We want to force a write first, so that cpu do not attempt + * to get cache line containing last_obj_cnt, num_queued, adj_limit + * in Shared state, but directly does a Request For Ownership + * It is only a hint, we use barrier() only. + */ + barrier(); + + dql->num_queued += count; } /* Returns how many objects can be queued, < 0 indicates over limit. */ static inline int dql_avail(const struct dql *dql) { - return dql->adj_limit - dql->num_queued; + return ACCESS_ONCE(dql->adj_limit) - ACCESS_ONCE(dql->num_queued); } /* Record number of completed objects and recalculate the limit. */ -- cgit v1.2.3 From 55d8694fa82c9b5858ae5a78a210353961f908f9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 26 Sep 2014 22:37:32 +0200 Subject: net: tcp: assign tcp cong_ops when tcp sk is created Split assignment and initialization from one into two functions. This is required by followup patches that add Datacenter TCP (DCTCP) congestion control algorithm - we need to be able to determine if the connection is moderated by DCTCP before the 3WHS has finished. As we walk the available congestion control list during the assignment, we are always guaranteed to have Reno present as it's fixed compiled-in. Therefore, since we're doing the early assignment, we don't have a real use for the Reno alias tcp_init_congestion_ops anymore and can thus remove it. Actual usage of the congestion control operations are being made after the 3WHS has finished, in some cases however we can access get_info() via diag if implemented, therefore we need to zero out the private area for those modules. Joint work with Daniel Borkmann and Glenn Judd. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 6 ++---- net/ipv4/tcp_cong.c | 46 ++++++++++++++++++++++------------------------ net/ipv4/tcp_minisocks.c | 5 ++--- 4 files changed, 27 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 02a9a2c366bf..f99b0c072ee5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -824,6 +824,7 @@ struct tcp_congestion_ops { int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); +void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); int tcp_set_default_congestion_control(const char *name); @@ -835,7 +836,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name); int tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); -extern struct tcp_congestion_ops tcp_init_congestion_ops; u32 tcp_reno_ssthresh(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 87289e51be00..cf5e508e1ef5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -405,7 +405,7 @@ void tcp_init_sock(struct sock *sk) tp->reordering = sysctl_tcp_reordering; tcp_enable_early_retrans(tp); - icsk->icsk_ca_ops = &tcp_init_congestion_ops; + tcp_assign_congestion_control(sk); tp->tsoffset = 0; @@ -3258,8 +3258,6 @@ void __init tcp_init(void) tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_metrics_init(); - - tcp_register_congestion_control(&tcp_reno); - + BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 80248f56c89f..a6c8a5775624 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Assign choice of congestion control. */ -void tcp_init_congestion_control(struct sock *sk) +void tcp_assign_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; - /* if no choice made yet assign the current value set as default */ - if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { - rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (try_module_get(ca->owner)) { - icsk->icsk_ca_ops = ca; - break; - } - - /* fallback to next available */ + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (likely(try_module_get(ca->owner))) { + icsk->icsk_ca_ops = ca; + goto out; } - rcu_read_unlock(); + /* Fallback to next available. The last really + * guaranteed fallback is Reno from this list. + */ } +out: + rcu_read_unlock(); + + /* Clear out private data before diag gets it and + * the ca has not been initialized. + */ + if (ca->get_info) + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); +} + +void tcp_init_congestion_control(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); @@ -345,15 +355,3 @@ struct tcp_congestion_ops tcp_reno = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, }; - -/* Initial congestion control used (until SYN) - * really reno under another name so we can tell difference - * during tcp_set_default_congestion_control - */ -struct tcp_congestion_ops tcp_init_congestion_ops = { - .name = "", - .owner = THIS_MODULE, - .ssthresh = tcp_reno_ssthresh, - .cong_avoid = tcp_reno_cong_avoid, -}; -EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a058f411d3a6..47b73506b77e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; - if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && - !try_module_get(newicsk->icsk_ca_ops->owner)) - newicsk->icsk_ca_ops = &tcp_init_congestion_ops; + if (!try_module_get(newicsk->icsk_ca_ops->owner)) + tcp_assign_congestion_control(newsk); tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); -- cgit v1.2.3 From 30e502a34b8b21fae2c789da102bd9f6e99fef83 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Sep 2014 22:37:33 +0200 Subject: net: tcp: add flag for ca to indicate that ECN is required This patch adds a flag to TCP congestion algorithms that allows for requesting to mark IPv4/IPv6 sockets with transport as ECN capable, that is, ECT(0), when required by a congestion algorithm. It is currently used and needed in DataCenter TCP (DCTCP), as it requires both peers to assert ECT on all IP packets sent - it uses ECN feedback (i.e. CE, Congestion Encountered information) from switches inside the data center to derive feedback to the end hosts. Therefore, simply add a new flag to icsk_ca_ops. Note that DCTCP's algorithm/behaviour slightly diverges from RFC3168, therefore this is only (!) enabled iff the assigned congestion control ops module has requested this. By that, we can tightly couple this logic really only to the provided congestion control ops. Joint work with Florian Westphal and Glenn Judd. Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 61 +++++++++++++++++++++++++++++++++++++-------------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 25 +++++++++++++++------ 3 files changed, 63 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index f99b0c072ee5..a12f145cfbc3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -733,23 +733,6 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set - * - * If we receive a SYN packet with these bits set, it means a network is - * playing bad games with TOS bits. In order to avoid possible false congestion - * notifications, we disable TCP ECN negociation. - */ -static inline void -TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, - struct net *net) -{ - const struct tcphdr *th = tcp_hdr(skb); - - if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr && - INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield)) - inet_rsk(req)->ecn_ok = 1; -} - /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ @@ -791,7 +774,10 @@ enum tcp_ca_event { #define TCP_CA_MAX 128 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) +/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ #define TCP_CONG_NON_RESTRICTED 0x1 +/* Requires ECN/ECT set on all packets */ +#define TCP_CONG_NEEDS_ECN 0x2 struct tcp_congestion_ops { struct list_head list; @@ -840,6 +826,13 @@ u32 tcp_reno_ssthresh(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; +static inline bool tcp_ca_needs_ecn(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; +} + static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -857,6 +850,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } +/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set + * + * If we receive a SYN packet with these bits set, it means a + * network is playing bad games with TOS bits. In order to + * avoid possible false congestion notifications, we disable + * TCP ECN negociation. + * + * Exception: tcp_ca wants ECN. This is required for DCTCP + * congestion control; it requires setting ECT on all packets, + * including SYN. We inverse the test in this case: If our + * local socket wants ECN, but peer only set ece/cwr (but not + * ECT in IP header) its probably a non-DCTCP aware sender. + */ +static inline void +TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, + const struct sock *listen_sk) +{ + const struct tcphdr *th = tcp_hdr(skb); + const struct net *net = sock_net(listen_sk); + bool th_ecn = th->ece && th->cwr; + bool ect, need_ecn; + + if (!th_ecn) + return; + + ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); + need_ecn = tcp_ca_needs_ecn(listen_sk); + + if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + inet_rsk(req)->ecn_ok = 1; + else if (ect && need_ecn) + inet_rsk(req)->ecn_ok = 1; +} + /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5073eefa6fae..fb0fe97e1c54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5944,7 +5944,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sock_net(sk)); + TCP_ECN_create_request(req, skb, sk); if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4d92703df4c6..20e73271d75c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk) } /* Packet ECN state for a SYN-ACK */ -static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) +static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb) { + const struct tcp_sock *tp = tcp_sk(sk); + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; + else if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); } /* Packet ECN state for a SYN. */ @@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = 0; - if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || + tcp_ca_needs_ecn(sk)) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); } } static __inline__ void -TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) +TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th, + struct sock *sk) { - if (inet_rsk(req)->ecn_ok) + if (inet_rsk(req)->ecn_ok) { th->ece = 1; + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + } } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to @@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, tcp_hdr(skb)->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } - } else { + } else if (!tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } @@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk) } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; - TCP_ECN_send_synack(tcp_sk(sk), skb); + TCP_ECN_send_synack(sk, skb); } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - TCP_ECN_make_synack(req, th); + TCP_ECN_make_synack(req, th, sk); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is -- cgit v1.2.3 From 7354c8c389d18719dd71cc810da70b0921d66694 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 26 Sep 2014 22:37:34 +0200 Subject: net: tcp: split ack slow/fast events from cwnd_event The congestion control ops "cwnd_event" currently supports CA_EVENT_FAST_ACK and CA_EVENT_SLOW_ACK events (among others). Both FAST and SLOW_ACK are only used by Westwood congestion control algorithm. This removes both flags from cwnd_event and adds a new in_ack_event callback for this. The goal is to be able to provide more detailed information about ACKs, such as whether ECE flag was set, or whether the ACK resulted in a window update. It is required for DataCenter TCP (DCTCP) congestion control algorithm as it makes a different choice depending on ECE being set or not. Joint work with Daniel Borkmann and Glenn Judd. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 8 ++++++-- net/ipv4/tcp_input.c | 12 ++++++++++-- net/ipv4/tcp_westwood.c | 28 ++++++++++++++++------------ 3 files changed, 32 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a12f145cfbc3..7ec6a28305c0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -763,8 +763,10 @@ enum tcp_ca_event { CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ - CA_EVENT_FAST_ACK, /* in sequence ack */ - CA_EVENT_SLOW_ACK, /* other ack */ +}; + +enum tcp_ca_ack_event_flags { + CA_ACK_SLOWPATH = (1 << 0), }; /* @@ -796,6 +798,8 @@ struct tcp_congestion_ops { void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); + /* call when ack arrives (optional) */ + void (*in_ack_event)(struct sock *sk, u32 flags); /* new value of cwnd after loss (optional) */ u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb0fe97e1c54..8a38774cc66e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3362,6 +3362,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) } } +static inline void tcp_in_ack_event(struct sock *sk, u32 flags) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->in_ack_event) + icsk->icsk_ca_ops->in_ack_event(sk, flags); +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3421,7 +3429,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; - tcp_ca_event(sk, CA_EVENT_FAST_ACK); + tcp_in_ack_event(sk, 0); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { @@ -3439,7 +3447,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; - tcp_ca_event(sk, CA_EVENT_SLOW_ACK); + tcp_in_ack_event(sk, CA_ACK_SLOWPATH); } /* We passed data and got it acked, remove any soft error diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 81911a92356c..bb63fba47d47 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -220,32 +220,35 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } +static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) +{ + if (ack_flags & CA_ACK_SLOWPATH) { + struct westwood *w = inet_csk_ca(sk); + + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); + + update_rtt_min(w); + return; + } + + westwood_fast_bw(sk); +} + static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); switch (event) { - case CA_EVENT_FAST_ACK: - westwood_fast_bw(sk); - break; - case CA_EVENT_COMPLETE_CWR: tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; - case CA_EVENT_LOSS: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); /* Update RTT_min when next ack arrives */ w->reset_rtt_min = 1; break; - - case CA_EVENT_SLOW_ACK: - westwood_update_window(sk); - w->bk += westwood_acked_count(sk); - update_rtt_min(w); - break; - default: /* don't care */ break; @@ -274,6 +277,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .cwnd_event = tcp_westwood_event, + .in_ack_event = tcp_westwood_ack, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, -- cgit v1.2.3 From 9890092e46b2996bb85f7f973e69424cb5c07bc0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 26 Sep 2014 22:37:35 +0200 Subject: net: tcp: more detailed ACK events and events for CE marked packets DataCenter TCP (DCTCP) determines cwnd growth based on ECN information and ACK properties, e.g. ACK that updates window is treated differently than DUPACK. Also DCTCP needs information whether ACK was delayed ACK. Furthermore, DCTCP also implements a CE state machine that keeps track of CE markings of incoming packets. Therefore, extend the congestion control framework to provide these event types, so that DCTCP can be properly implemented as a normal congestion algorithm module outside of the core stack. Joint work with Daniel Borkmann and Glenn Judd. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 9 ++++++++- net/ipv4/tcp_input.c | 22 ++++++++++++++++++---- net/ipv4/tcp_output.c | 4 ++++ 3 files changed, 30 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7ec6a28305c0..1f57c5363492 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -763,10 +763,17 @@ enum tcp_ca_event { CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ + CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */ + CA_EVENT_NON_DELAYED_ACK, }; +/* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { - CA_ACK_SLOWPATH = (1 << 0), + CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ + CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ + CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ }; /* diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8a38774cc66e..fc133178c787 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s tcp_enter_quickack_mode((struct sock *)tp); break; case INET_ECN_CE: + if (tcp_ca_needs_ecn((struct sock *)tp)) + tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode((struct sock *)tp); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } - /* fallinto */ + tp->ecn_flags |= TCP_ECN_SEEN; + break; default: + if (tcp_ca_needs_ecn((struct sock *)tp)) + tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; + break; } } @@ -3429,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; - tcp_in_ack_event(sk, 0); + tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { + u32 ack_ev_flags = CA_ACK_SLOWPATH; + if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else @@ -3444,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_rtt_us); - if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) + if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; + ack_ev_flags |= CA_ACK_ECE; + } + + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; - tcp_in_ack_event(sk, CA_ACK_SLOWPATH); + tcp_in_ack_event(sk, ack_ev_flags); } /* We passed data and got it acked, remove any soft error diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20e73271d75c..124f9e4e4594 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3130,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk) int ato = icsk->icsk_ack.ato; unsigned long timeout; + tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); + if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; @@ -3186,6 +3188,8 @@ void tcp_send_ack(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return; + tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); + /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. -- cgit v1.2.3 From e3118e8359bb7c59555aca60c725106e6d78c5ce Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Sep 2014 22:37:36 +0200 Subject: net: tcp: add DCTCP congestion control algorithm This work adds the DataCenter TCP (DCTCP) congestion control algorithm [1], which has been first published at SIGCOMM 2010 [2], resp. follow-up analysis at SIGMETRICS 2011 [3] (and also, more recently as an informational IETF draft available at [4]). DCTCP is an enhancement to the TCP congestion control algorithm for data center networks. Typical data center workloads are i.e. i) partition/aggregate (queries; bursty, delay sensitive), ii) short messages e.g. 50KB-1MB (for coordination and control state; delay sensitive), and iii) large flows e.g. 1MB-100MB (data update; throughput sensitive). DCTCP has therefore been designed for such environments to provide/achieve the following three requirements: * High burst tolerance (incast due to partition/aggregate) * Low latency (short flows, queries) * High throughput (continuous data updates, large file transfers) with commodity, shallow buffered switches The basic idea of its design consists of two fundamentals: i) on the switch side, packets are being marked when its internal queue length > threshold K (K is chosen so that a large enough headroom for marked traffic is still available in the switch queue); ii) the sender/host side maintains a moving average of the fraction of marked packets, so each RTT, F is being updated as follows: F := X / Y, where X is # of marked ACKs, Y is total # of ACKs alpha := (1 - g) * alpha + g * F, where g is a smoothing constant The resulting alpha (iow: probability that switch queue is congested) is then being used in order to adaptively decrease the congestion window W: W := (1 - (alpha / 2)) * W The means for receiving marked packets resp. marking them on switch side in DCTCP is the use of ECN. RFC3168 describes a mechanism for using Explicit Congestion Notification from the switch for early detection of congestion, rather than waiting for segment loss to occur. However, this method only detects the presence of congestion, not the *extent*. In the presence of mild congestion, it reduces the TCP congestion window too aggressively and unnecessarily affects the throughput of long flows [4]. DCTCP, as mentioned, enhances Explicit Congestion Notification (ECN) processing to estimate the fraction of bytes that encounter congestion, rather than simply detecting that some congestion has occurred. DCTCP then scales the TCP congestion window based on this estimate [4], thus it can derive multibit feedback from the information present in the single-bit sequence of marks in its control law. And thus act in *proportion* to the extent of congestion, not its *presence*. Switches therefore set the Congestion Experienced (CE) codepoint in packets when internal queue lengths exceed threshold K. Resulting, DCTCP delivers the same or better throughput than normal TCP, while using 90% less buffer space. It was found in [2] that DCTCP enables the applications to handle 10x the current background traffic, without impacting foreground traffic. Moreover, a 10x increase in foreground traffic did not cause any timeouts, and thus largely eliminates TCP incast collapse problems. The algorithm itself has already seen deployments in large production data centers since then. We did a long-term stress-test and analysis in a data center, short summary of our TCP incast tests with iperf compared to cubic: This test measured DCTCP throughput and latency and compared it with CUBIC throughput and latency for an incast scenario. In this test, 19 senders sent at maximum rate to a single receiver. The receiver simply ran iperf -s. The senders ran iperf -c -t 30. All senders started simultaneously (using local clocks synchronized by ntp). This test was repeated multiple times. Below shows the results from a single test. Other tests are similar. (DCTCP results were extremely consistent, CUBIC results show some variance induced by the TCP timeouts that CUBIC encountered.) For this test, we report statistics on the number of TCP timeouts, flow throughput, and traffic latency. 1) Timeouts (total over all flows, and per flow summaries): CUBIC DCTCP Total 3227 25 Mean 169.842 1.316 Median 183 1 Max 207 5 Min 123 0 Stddev 28.991 1.600 Timeout data is taken by measuring the net change in netstat -s "other TCP timeouts" reported. As a result, the timeout measurements above are not restricted to the test traffic, and we believe that it is likely that all of the "DCTCP timeouts" are actually timeouts for non-test traffic. We report them nevertheless. CUBIC will also include some non-test timeouts, but they are drawfed by bona fide test traffic timeouts for CUBIC. Clearly DCTCP does an excellent job of preventing TCP timeouts. DCTCP reduces timeouts by at least two orders of magnitude and may well have eliminated them in this scenario. 2) Throughput (per flow in Mbps): CUBIC DCTCP Mean 521.684 521.895 Median 464 523 Max 776 527 Min 403 519 Stddev 105.891 2.601 Fairness 0.962 0.999 Throughput data was simply the average throughput for each flow reported by iperf. By avoiding TCP timeouts, DCTCP is able to achieve much better per-flow results. In CUBIC, many flows experience TCP timeouts which makes flow throughput unpredictable and unfair. DCTCP, on the other hand, provides very clean predictable throughput without incurring TCP timeouts. Thus, the standard deviation of CUBIC throughput is dramatically higher than the standard deviation of DCTCP throughput. Mean throughput is nearly identical because even though cubic flows suffer TCP timeouts, other flows will step in and fill the unused bandwidth. Note that this test is something of a best case scenario for incast under CUBIC: it allows other flows to fill in for flows experiencing a timeout. Under situations where the receiver is issuing requests and then waiting for all flows to complete, flows cannot fill in for timed out flows and throughput will drop dramatically. 3) Latency (in ms): CUBIC DCTCP Mean 4.0088 0.04219 Median 4.055 0.0395 Max 4.2 0.085 Min 3.32 0.028 Stddev 0.1666 0.01064 Latency for each protocol was computed by running "ping -i 0.2 " from a single sender to the receiver during the incast test. For DCTCP, "ping -Q 0x6 -i 0.2 " was used to ensure that traffic traversed the DCTCP queue and was not dropped when the queue size was greater than the marking threshold. The summary statistics above are over all ping metrics measured between the single sender, receiver pair. The latency results for this test show a dramatic difference between CUBIC and DCTCP. CUBIC intentionally overflows the switch buffer which incurs the maximum queue latency (more buffer memory will lead to high latency.) DCTCP, on the other hand, deliberately attempts to keep queue occupancy low. The result is a two orders of magnitude reduction of latency with DCTCP - even with a switch with relatively little RAM. Switches with larger amounts of RAM will incur increasing amounts of latency for CUBIC, but not for DCTCP. 4) Convergence and stability test: This test measured the time that DCTCP took to fairly redistribute bandwidth when a new flow commences. It also measured DCTCP's ability to remain stable at a fair bandwidth distribution. DCTCP is compared with CUBIC for this test. At the commencement of this test, a single flow is sending at maximum rate (near 10 Gbps) to a single receiver. One second after that first flow commences, a new flow from a distinct server begins sending to the same receiver as the first flow. After the second flow has sent data for 10 seconds, the second flow is terminated. The first flow sends for an additional second. Ideally, the bandwidth would be evenly shared as soon as the second flow starts, and recover as soon as it stops. The results of this test are shown below. Note that the flow bandwidth for the two flows was measured near the same time, but not simultaneously. DCTCP performs nearly perfectly within the measurement limitations of this test: bandwidth is quickly distributed fairly between the two flows, remains stable throughout the duration of the test, and recovers quickly. CUBIC, in contrast, is slow to divide the bandwidth fairly, and has trouble remaining stable. CUBIC DCTCP Seconds Flow 1 Flow 2 Seconds Flow 1 Flow 2 0 9.93 0 0 9.92 0 0.5 9.87 0 0.5 9.86 0 1 8.73 2.25 1 6.46 4.88 1.5 7.29 2.8 1.5 4.9 4.99 2 6.96 3.1 2 4.92 4.94 2.5 6.67 3.34 2.5 4.93 5 3 6.39 3.57 3 4.92 4.99 3.5 6.24 3.75 3.5 4.94 4.74 4 6 3.94 4 5.34 4.71 4.5 5.88 4.09 4.5 4.99 4.97 5 5.27 4.98 5 4.83 5.01 5.5 4.93 5.04 5.5 4.89 4.99 6 4.9 4.99 6 4.92 5.04 6.5 4.93 5.1 6.5 4.91 4.97 7 4.28 5.8 7 4.97 4.97 7.5 4.62 4.91 7.5 4.99 4.82 8 5.05 4.45 8 5.16 4.76 8.5 5.93 4.09 8.5 4.94 4.98 9 5.73 4.2 9 4.92 5.02 9.5 5.62 4.32 9.5 4.87 5.03 10 6.12 3.2 10 4.91 5.01 10.5 6.91 3.11 10.5 4.87 5.04 11 8.48 0 11 8.49 4.94 11.5 9.87 0 11.5 9.9 0 SYN/ACK ECT test: This test demonstrates the importance of ECT on SYN and SYN-ACK packets by measuring the connection probability in the presence of competing flows for a DCTCP connection attempt *without* ECT in the SYN packet. The test was repeated five times for each number of competing flows. Competing Flows 1 | 2 | 4 | 8 | 16 ------------------------------ Mean Connection Probability 1 | 0.67 | 0.45 | 0.28 | 0 Median Connection Probability 1 | 0.65 | 0.45 | 0.25 | 0 As the number of competing flows moves beyond 1, the connection probability drops rapidly. Enabling DCTCP with this patch requires the following steps: DCTCP must be running both on the sender and receiver side in your data center, i.e.: sysctl -w net.ipv4.tcp_congestion_control=dctcp Also, ECN functionality must be enabled on all switches in your data center for DCTCP to work. The default ECN marking threshold (K) heuristic on the switch for DCTCP is e.g., 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps (K > 1/7 * C * RTT, [4]). In above tests, for each switch port, traffic was segregated into two queues. For any packet with a DSCP of 0x01 - or equivalently a TOS of 0x04 - the packet was placed into the DCTCP queue. All other packets were placed into the default drop-tail queue. For the DCTCP queue, RED/ECN marking was enabled, here, with a marking threshold of 75 KB. More details however, we refer you to the paper [2] under section 3). There are no code changes required to applications running in user space. DCTCP has been implemented in full *isolation* of the rest of the TCP code as its own congestion control module, so that it can run without a need to expose code to the core of the TCP stack, and thus nothing changes for non-DCTCP users. Changes in the CA framework code are minimal, and DCTCP algorithm operates on mechanisms that are already available in most Silicon. The gain (dctcp_shift_g) is currently a fixed constant (1/16) from the paper, but we leave the option that it can be chosen carefully to a different value by the user. In case DCTCP is being used and ECN support on peer site is off, DCTCP falls back after 3WHS to operate in normal TCP Reno mode. ss {-4,-6} -t -i diag interface: ... dctcp wscale:7,7 rto:203 rtt:2.349/0.026 mss:1448 cwnd:2054 ssthresh:1102 ce_state 0 alpha 15 ab_ecn 0 ab_tot 735584 send 10129.2Mbps pacing_rate 20254.1Mbps unacked:1822 retrans:0/15 reordering:101 rcv_space:29200 ... dctcp-reno wscale:7,7 rto:201 rtt:0.711/1.327 ato:40 mss:1448 cwnd:10 ssthresh:1102 fallback_mode send 162.9Mbps pacing_rate 325.5Mbps rcv_rtt:1.5 rcv_space:29200 More information about DCTCP can be found in [1-4]. [1] http://simula.stanford.edu/~alizade/Site/DCTCP.html [2] http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf [3] http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf [4] http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00 Joint work with Florian Westphal and Glenn Judd. Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- Documentation/networking/dctcp.txt | 43 +++++ include/uapi/linux/inet_diag.h | 13 +- net/ipv4/Kconfig | 26 ++- net/ipv4/Makefile | 1 + net/ipv4/tcp_dctcp.c | 344 +++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_output.c | 1 + 6 files changed, 425 insertions(+), 3 deletions(-) create mode 100644 Documentation/networking/dctcp.txt create mode 100644 net/ipv4/tcp_dctcp.c (limited to 'include') diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt new file mode 100644 index 000000000000..0d5dfbc89ec9 --- /dev/null +++ b/Documentation/networking/dctcp.txt @@ -0,0 +1,43 @@ +DCTCP (DataCenter TCP) +---------------------- + +DCTCP is an enhancement to the TCP congestion control algorithm for data +center networks and leverages Explicit Congestion Notification (ECN) in +the data center network to provide multi-bit feedback to the end hosts. + +To enable it on end hosts: + + sysctl -w net.ipv4.tcp_congestion_control=dctcp + +All switches in the data center network running DCTCP must support ECN +marking and be configured for marking when reaching defined switch buffer +thresholds. The default ECN marking threshold heuristic for DCTCP on +switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps, +but might need further careful tweaking. + +For more details, see below documents: + +Paper: + +The algorithm is further described in detail in the following two +SIGCOMM/SIGMETRICS papers: + + i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, + Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: + "Data Center TCP (DCTCP)", Data Center Networks session + Proc. ACM SIGCOMM, New Delhi, 2010. + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192 + +ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: + "Analysis of DCTCP: Stability, Convergence, and Fairness" + Proc. ACM SIGMETRICS, San Jose, 2011. + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf + +IETF informational draft: + + http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00 + +DCTCP site: + + http://simula.stanford.edu/~alizade/Site/DCTCP.html diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index bbde90fa5838..d65c0a09efd3 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -110,10 +110,10 @@ enum { INET_DIAG_TCLASS, INET_DIAG_SKMEMINFO, INET_DIAG_SHUTDOWN, + INET_DIAG_DCTCPINFO, }; -#define INET_DIAG_MAX INET_DIAG_SHUTDOWN - +#define INET_DIAG_MAX INET_DIAG_DCTCPINFO /* INET_DIAG_MEM */ @@ -133,5 +133,14 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; +/* INET_DIAG_DCTCPINFO */ + +struct tcp_dctcp_info { + __u16 dctcp_enabled; + __u16 dctcp_ce_state; + __u32 dctcp_alpha; + __u32 dctcp_ab_ecn; + __u32 dctcp_ab_tot; +}; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 84f710b7472a..69fb37854449 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -570,6 +570,27 @@ config TCP_CONG_ILLINOIS For further details see: http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html +config TCP_CONG_DCTCP + tristate "DataCenter TCP (DCTCP)" + default n + ---help--- + DCTCP leverages Explicit Congestion Notification (ECN) in the network to + provide multi-bit feedback to the end hosts. It is designed to provide: + + - High burst tolerance (incast due to partition/aggregate), + - Low latency (short flows, queries), + - High throughput (continuous data updates, large file transfers) with + commodity, shallow-buffered switches. + + All switches in the data center network running DCTCP must support + ECN marking and be configured for marking when reaching defined switch + buffer thresholds. The default ECN marking threshold heuristic for + DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets + (~100KB) at 10Gbps, but might need further careful tweaking. + + For further details see: + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -598,9 +619,11 @@ choice config DEFAULT_WESTWOOD bool "Westwood" if TCP_CONG_WESTWOOD=y + config DEFAULT_DCTCP + bool "DCTCP" if TCP_CONG_DCTCP=y + config DEFAULT_RENO bool "Reno" - endchoice endif @@ -620,6 +643,7 @@ config DEFAULT_TCP_CONG default "westwood" if DEFAULT_WESTWOOD default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO + default "dctcp" if DEFAULT_DCTCP default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index d78d404c596f..d8105787c199 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o +obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c new file mode 100644 index 000000000000..b504371af742 --- /dev/null +++ b/net/ipv4/tcp_dctcp.c @@ -0,0 +1,344 @@ +/* DataCenter TCP (DCTCP) congestion control. + * + * http://simula.stanford.edu/~alizade/Site/DCTCP.html + * + * This is an implementation of DCTCP over Reno, an enhancement to the + * TCP congestion control algorithm designed for data centers. DCTCP + * leverages Explicit Congestion Notification (ECN) in the network to + * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet + * the following three data center transport requirements: + * + * - High burst tolerance (incast due to partition/aggregate) + * - Low latency (short flows, queries) + * - High throughput (continuous data updates, large file transfers) + * with commodity shallow buffered switches + * + * The algorithm is described in detail in the following two papers: + * + * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, + * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: + * "Data Center TCP (DCTCP)", Data Center Networks session + * Proc. ACM SIGCOMM, New Delhi, 2010. + * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + * + * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: + * "Analysis of DCTCP: Stability, Convergence, and Fairness" + * Proc. ACM SIGMETRICS, San Jose, 2011. + * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf + * + * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh. + * + * Authors: + * + * Daniel Borkmann + * Florian Westphal + * Glenn Judd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include + +#define DCTCP_MAX_ALPHA 1024U + +struct dctcp { + u32 acked_bytes_ecn; + u32 acked_bytes_total; + u32 prior_snd_una; + u32 prior_rcv_nxt; + u32 dctcp_alpha; + u32 next_seq; + u32 ce_state; + u32 delayed_ack_reserved; +}; + +static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ +module_param(dctcp_shift_g, uint, 0644); +MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); + +static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; +module_param(dctcp_alpha_on_init, uint, 0644); +MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); + +static unsigned int dctcp_clamp_alpha_on_loss __read_mostly; +module_param(dctcp_clamp_alpha_on_loss, uint, 0644); +MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss, + "parameter for clamping alpha on loss"); + +static struct tcp_congestion_ops dctcp_reno; + +static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) +{ + ca->next_seq = tp->snd_nxt; + + ca->acked_bytes_ecn = 0; + ca->acked_bytes_total = 0; +} + +static void dctcp_init(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + if ((tp->ecn_flags & TCP_ECN_OK) || + (sk->sk_state == TCP_LISTEN || + sk->sk_state == TCP_CLOSE)) { + struct dctcp *ca = inet_csk_ca(sk); + + ca->prior_snd_una = tp->snd_una; + ca->prior_rcv_nxt = tp->rcv_nxt; + + ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); + + ca->delayed_ack_reserved = 0; + ca->ce_state = 0; + + dctcp_reset(tp, ca); + return; + } + + /* No ECN support? Fall back to Reno. Also need to clear + * ECT from sk since it is set during 3WHS for DCTCP. + */ + inet_csk(sk)->icsk_ca_ops = &dctcp_reno; + INET_ECN_dontxmit(sk); +} + +static u32 dctcp_ssthresh(struct sock *sk) +{ + const struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); +} + +/* Minimal DCTP CE state machine: + * + * S: 0 <- last pkt was non-CE + * 1 <- last pkt was CE + */ + +static void dctcp_ce_state_0_to_1(struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* State has changed from CE=0 to CE=1 and delayed + * ACK has not sent yet. + */ + if (!ca->ce_state && ca->delayed_ack_reserved) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ + tmp_rcv_nxt = tp->rcv_nxt; + + /* Generate previous ack with CE=0. */ + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tp->rcv_nxt = ca->prior_rcv_nxt; + + tcp_send_ack(sk); + + /* Recover current rcv_nxt. */ + tp->rcv_nxt = tmp_rcv_nxt; + } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 1; + + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; +} + +static void dctcp_ce_state_1_to_0(struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* State has changed from CE=1 to CE=0 and delayed + * ACK has not sent yet. + */ + if (ca->ce_state && ca->delayed_ack_reserved) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ + tmp_rcv_nxt = tp->rcv_nxt; + + /* Generate previous ack with CE=1. */ + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + tp->rcv_nxt = ca->prior_rcv_nxt; + + tcp_send_ack(sk); + + /* Recover current rcv_nxt. */ + tp->rcv_nxt = tmp_rcv_nxt; + } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 0; + + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static void dctcp_update_alpha(struct sock *sk, u32 flags) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct dctcp *ca = inet_csk_ca(sk); + u32 acked_bytes = tp->snd_una - ca->prior_snd_una; + + /* If ack did not advance snd_una, count dupack as MSS size. + * If ack did update window, do not count it at all. + */ + if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) + acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; + if (acked_bytes) { + ca->acked_bytes_total += acked_bytes; + ca->prior_snd_una = tp->snd_una; + + if (flags & CA_ACK_ECE) + ca->acked_bytes_ecn += acked_bytes; + } + + /* Expired RTT */ + if (!before(tp->snd_una, ca->next_seq)) { + /* For avoiding denominator == 1. */ + if (ca->acked_bytes_total == 0) + ca->acked_bytes_total = 1; + + /* alpha = (1 - g) * alpha + g * F */ + ca->dctcp_alpha = ca->dctcp_alpha - + (ca->dctcp_alpha >> dctcp_shift_g) + + (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / + ca->acked_bytes_total; + + if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) + /* Clamp dctcp_alpha to max. */ + ca->dctcp_alpha = DCTCP_MAX_ALPHA; + + dctcp_reset(tp, ca); + } +} + +static void dctcp_state(struct sock *sk, u8 new_state) +{ + if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) { + struct dctcp *ca = inet_csk_ca(sk); + + /* If this extension is enabled, we clamp dctcp_alpha to + * max on packet loss; the motivation is that dctcp_alpha + * is an indicator to the extend of congestion and packet + * loss is an indicator of extreme congestion; setting + * this in practice turned out to be beneficial, and + * effectively assumes total congestion which reduces the + * window by half. + */ + ca->dctcp_alpha = DCTCP_MAX_ALPHA; + } +} + +static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) +{ + struct dctcp *ca = inet_csk_ca(sk); + + switch (ev) { + case CA_EVENT_DELAYED_ACK: + if (!ca->delayed_ack_reserved) + ca->delayed_ack_reserved = 1; + break; + case CA_EVENT_NON_DELAYED_ACK: + if (ca->delayed_ack_reserved) + ca->delayed_ack_reserved = 0; + break; + default: + /* Don't care for the rest. */ + break; + } +} + +static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) +{ + switch (ev) { + case CA_EVENT_ECN_IS_CE: + dctcp_ce_state_0_to_1(sk); + break; + case CA_EVENT_ECN_NO_CE: + dctcp_ce_state_1_to_0(sk); + break; + case CA_EVENT_DELAYED_ACK: + case CA_EVENT_NON_DELAYED_ACK: + dctcp_update_ack_reserved(sk, ev); + break; + default: + /* Don't care for the rest. */ + break; + } +} + +static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + const struct dctcp *ca = inet_csk_ca(sk); + + /* Fill it also in case of VEGASINFO due to req struct limits. + * We can still correctly retrieve it later. + */ + if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcp_dctcp_info info; + + memset(&info, 0, sizeof(info)); + if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { + info.dctcp_enabled = 1; + info.dctcp_ce_state = (u16) ca->ce_state; + info.dctcp_alpha = ca->dctcp_alpha; + info.dctcp_ab_ecn = ca->acked_bytes_ecn; + info.dctcp_ab_tot = ca->acked_bytes_total; + } + + nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); + } +} + +static struct tcp_congestion_ops dctcp __read_mostly = { + .init = dctcp_init, + .in_ack_event = dctcp_update_alpha, + .cwnd_event = dctcp_cwnd_event, + .ssthresh = dctcp_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .set_state = dctcp_state, + .get_info = dctcp_get_info, + .flags = TCP_CONG_NEEDS_ECN, + .owner = THIS_MODULE, + .name = "dctcp", +}; + +static struct tcp_congestion_ops dctcp_reno __read_mostly = { + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .get_info = dctcp_get_info, + .owner = THIS_MODULE, + .name = "dctcp-reno", +}; + +static int __init dctcp_register(void) +{ + BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&dctcp); +} + +static void __exit dctcp_unregister(void) +{ + tcp_unregister_congestion_control(&dctcp); +} + +module_init(dctcp_register); +module_exit(dctcp_unregister); + +MODULE_AUTHOR("Daniel Borkmann "); +MODULE_AUTHOR("Florian Westphal "); +MODULE_AUTHOR("Glenn Judd "); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("DataCenter TCP (DCTCP)"); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 124f9e4e4594..86a0216fcaa1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3211,6 +3211,7 @@ void tcp_send_ack(struct sock *sk) skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } +EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. -- cgit v1.2.3 From 9363dc4b599949bde338cdaba1cf7cac243e4e97 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Tue, 23 Sep 2014 13:30:41 +0200 Subject: netfilter: nf_tables: store and dump set policy We want to know in which cases the user explicitly sets the policy options. In that case, we also want to dump back the info. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c4d86198d3d6..3d7292392fac 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -241,6 +241,7 @@ void nft_unregister_set(struct nft_set_ops *ops); * @dtype: data type (verdict or numeric type defined by userspace) * @size: maximum set size * @nelems: number of elements + * @policy: set parameterization (see enum nft_set_policies) * @ops: set ops * @flags: set flags * @klen: key length @@ -255,6 +256,7 @@ struct nft_set { u32 dtype; u32 size; u32 nelems; + u16 policy; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a476b9962155..19e79f0d9ad2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2344,6 +2344,11 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + if (set->policy != NFT_SET_POL_PERFORMANCE) { + if (nla_put_be32(skb, NFTA_SET_POLICY, htonl(set->policy))) + goto nla_put_failure; + } + desc = nla_nest_start(skb, NFTA_SET_DESC); if (desc == NULL) goto nla_put_failure; @@ -2669,6 +2674,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, set->dlen = desc.dlen; set->flags = flags; set->size = desc.size; + set->policy = policy; err = ops->init(set, &desc, nla); if (err < 0) -- cgit v1.2.3 From b1937227316417aa7568d01e6fa1f272e98fb890 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2014 22:18:47 -0700 Subject: net: reorganize sk_buff for faster __copy_skb_header() With proliferation of bit fields in sk_buff, __copy_skb_header() became quite expensive, showing as the most expensive function in a GSO workload. __copy_skb_header() performance is also critical for non GSO TCP operations, as it is used from skb_clone() This patch carefully moves all the fields that were not copied in a separate zone : cloned, nohdr, fclone, peeked, head_frag, xmit_more Then I moved all other fields and all other copied fields in a section delimited by headers_start[0]/headers_end[0] section so that we can use a single memcpy() call, inlined by compiler using long word load/stores. I also tried to make all copies in the natural orders of sk_buff, to help hardware prefetching. I made sure sk_buff size did not change. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 133 ++++++++++++++++++++++++++----------------------- net/core/skbuff.c | 80 ++++++++++++++--------------- 2 files changed, 113 insertions(+), 100 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8eaa62400fca..b6cced304b26 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -527,27 +527,41 @@ struct sk_buff { char cb[48] __aligned(8); unsigned long _skb_refdst; + void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_XFRM struct sec_path *sp; +#endif +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + struct nf_conntrack *nfct; +#endif +#ifdef CONFIG_BRIDGE_NETFILTER + struct nf_bridge_info *nf_bridge; #endif unsigned int len, data_len; __u16 mac_len, hdr_len; - union { - __wsum csum; - struct { - __u16 csum_start; - __u16 csum_offset; - }; - }; - __u32 priority; + + /* Following fields are _not_ copied in __copy_skb_header() + * Note that queue_mapping is here mostly to fill a hole. + */ kmemcheck_bitfield_begin(flags1); - __u8 ignore_df:1, - cloned:1, - ip_summed:2, + __u16 queue_mapping; + __u8 cloned:1, nohdr:1, - nfctinfo:3; + fclone:2, + peeked:1, + head_frag:1, + xmit_more:1; + /* one bit hole */ + kmemcheck_bitfield_end(flags1); + + + + /* fields enclosed in headers_start/headers_end are copied + * using a single memcpy() in __copy_skb_header() + */ + __u32 headers_start[0]; /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD @@ -558,58 +572,53 @@ struct sk_buff { #define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) __u8 __pkt_type_offset[0]; - __u8 pkt_type:3, - fclone:2, - ipvs_property:1, - peeked:1, - nf_trace:1; - kmemcheck_bitfield_end(flags1); - __be16 protocol; - - void (*destructor)(struct sk_buff *skb); -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - struct nf_conntrack *nfct; -#endif -#ifdef CONFIG_BRIDGE_NETFILTER - struct nf_bridge_info *nf_bridge; -#endif - - int skb_iif; - - __u32 hash; - - __be16 vlan_proto; - __u16 vlan_tci; - -#ifdef CONFIG_NET_SCHED - __u16 tc_index; /* traffic control index */ -#ifdef CONFIG_NET_CLS_ACT - __u16 tc_verd; /* traffic control verdict */ -#endif -#endif - - __u16 queue_mapping; - kmemcheck_bitfield_begin(flags2); - __u8 xmit_more:1; -#ifdef CONFIG_IPV6_NDISC_NODETYPE - __u8 ndisc_nodetype:2; -#endif + __u8 pkt_type:3; __u8 pfmemalloc:1; + __u8 ignore_df:1; + __u8 nfctinfo:3; + + __u8 nf_trace:1; + __u8 ip_summed:2; __u8 ooo_okay:1; __u8 l4_hash:1; __u8 sw_hash:1; __u8 wifi_acked_valid:1; __u8 wifi_acked:1; + __u8 no_fcs:1; - __u8 head_frag:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; __u8 csum_complete_sw:1; - /* 1/3 bit hole (depending on ndisc_nodetype presence) */ - kmemcheck_bitfield_end(flags2); + __u8 csum_level:2; + __u8 csum_bad:1; +#ifdef CONFIG_IPV6_NDISC_NODETYPE + __u8 ndisc_nodetype:2; +#endif + __u8 ipvs_property:1; + /* 5 or 7 bit hole */ + +#ifdef CONFIG_NET_SCHED + __u16 tc_index; /* traffic control index */ +#ifdef CONFIG_NET_CLS_ACT + __u16 tc_verd; /* traffic control verdict */ +#endif +#endif + + union { + __wsum csum; + struct { + __u16 csum_start; + __u16 csum_offset; + }; + }; + __u32 priority; + int skb_iif; + __u32 hash; + __be16 vlan_proto; + __u16 vlan_tci; #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL union { unsigned int napi_id; @@ -625,19 +634,18 @@ struct sk_buff { __u32 reserved_tailroom; }; - kmemcheck_bitfield_begin(flags3); - __u8 csum_level:2; - __u8 csum_bad:1; - /* 13 bit hole */ - kmemcheck_bitfield_end(flags3); - __be16 inner_protocol; __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; + + __be16 protocol; __u16 transport_header; __u16 network_header; __u16 mac_header; + + __u32 headers_end[0]; + /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; sk_buff_data_t end; @@ -3040,19 +3048,22 @@ static inline void nf_reset_trace(struct sk_buff *skb) } /* Note: This doesn't put any conntrack and bridge info in dst. */ -static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) +static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, + bool copy) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) dst->nfct = src->nfct; nf_conntrack_get(src->nfct); - dst->nfctinfo = src->nfctinfo; + if (copy) + dst->nfctinfo = src->nfctinfo; #endif #ifdef CONFIG_BRIDGE_NETFILTER dst->nf_bridge = src->nf_bridge; nf_bridge_get(src->nf_bridge); #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) - dst->nf_trace = src->nf_trace; + if (copy) + dst->nf_trace = src->nf_trace; #endif } @@ -3064,7 +3075,7 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(dst->nf_bridge); #endif - __nf_copy(dst, src); + __nf_copy(dst, src, true); } #ifdef CONFIG_NETWORK_SECMARK diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d4fdc649112c..4be570a4ab21 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -261,7 +261,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, atomic_t *fclone_ref = (atomic_t *) (child + 1); kmemcheck_annotate_bitfield(child, flags1); - kmemcheck_annotate_bitfield(child, flags2); skb->fclone = SKB_FCLONE_ORIG; atomic_set(fclone_ref, 1); @@ -675,57 +674,61 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); +/* Make sure a field is enclosed inside headers_start/headers_end section */ +#define CHECK_SKB_FIELD(field) \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ + offsetof(struct sk_buff, headers_start)); \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) > \ + offsetof(struct sk_buff, headers_end)); \ + static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; + /* We do not copy old->sk */ new->dev = old->dev; - new->transport_header = old->transport_header; - new->network_header = old->network_header; - new->mac_header = old->mac_header; - new->inner_protocol = old->inner_protocol; - new->inner_transport_header = old->inner_transport_header; - new->inner_network_header = old->inner_network_header; - new->inner_mac_header = old->inner_mac_header; + memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); - skb_copy_hash(new, old); - new->ooo_okay = old->ooo_okay; - new->no_fcs = old->no_fcs; - new->encapsulation = old->encapsulation; - new->encap_hdr_csum = old->encap_hdr_csum; - new->csum_valid = old->csum_valid; - new->csum_complete_sw = old->csum_complete_sw; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif - memcpy(new->cb, old->cb, sizeof(old->cb)); - new->csum = old->csum; - new->ignore_df = old->ignore_df; - new->pkt_type = old->pkt_type; - new->ip_summed = old->ip_summed; - skb_copy_queue_mapping(new, old); - new->priority = old->priority; -#if IS_ENABLED(CONFIG_IP_VS) - new->ipvs_property = old->ipvs_property; + __nf_copy(new, old, false); + + /* Note : this field could be in headers_start/headers_end section + * It is not yet because we do not want to have a 16 bit hole + */ + new->queue_mapping = old->queue_mapping; + + memcpy(&new->headers_start, &old->headers_start, + offsetof(struct sk_buff, headers_end) - + offsetof(struct sk_buff, headers_start)); + CHECK_SKB_FIELD(protocol); + CHECK_SKB_FIELD(csum); + CHECK_SKB_FIELD(hash); + CHECK_SKB_FIELD(priority); + CHECK_SKB_FIELD(skb_iif); + CHECK_SKB_FIELD(vlan_proto); + CHECK_SKB_FIELD(vlan_tci); + CHECK_SKB_FIELD(transport_header); + CHECK_SKB_FIELD(network_header); + CHECK_SKB_FIELD(mac_header); + CHECK_SKB_FIELD(inner_protocol); + CHECK_SKB_FIELD(inner_transport_header); + CHECK_SKB_FIELD(inner_network_header); + CHECK_SKB_FIELD(inner_mac_header); + CHECK_SKB_FIELD(mark); +#ifdef CONFIG_NETWORK_SECMARK + CHECK_SKB_FIELD(secmark); +#endif +#ifdef CONFIG_NET_RX_BUSY_POLL + CHECK_SKB_FIELD(napi_id); #endif - new->pfmemalloc = old->pfmemalloc; - new->protocol = old->protocol; - new->mark = old->mark; - new->skb_iif = old->skb_iif; - __nf_copy(new, old); #ifdef CONFIG_NET_SCHED - new->tc_index = old->tc_index; + CHECK_SKB_FIELD(tc_index); #ifdef CONFIG_NET_CLS_ACT - new->tc_verd = old->tc_verd; + CHECK_SKB_FIELD(tc_verd); #endif #endif - new->vlan_proto = old->vlan_proto; - new->vlan_tci = old->vlan_tci; - - skb_copy_secmark(new, old); -#ifdef CONFIG_NET_RX_BUSY_POLL - new->napi_id = old->napi_id; -#endif } /* @@ -876,7 +879,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return NULL; kmemcheck_annotate_bitfield(n, flags1); - kmemcheck_annotate_bitfield(n, flags2); n->fclone = SKB_FCLONE_UNAVAILABLE; } -- cgit v1.2.3 From 8c14f9c70327a6fb75534c4c61d7ea9c82ccf78f Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Mon, 29 Sep 2014 11:55:36 +0200 Subject: ARCNET: add com20020 PCI IDs with metadata This patch adds metadata for the com20020 to prepare for devices with multiple io address areas with multi card interfaces. Signed-off-by: Michael Grzeschik Signed-off-by: David S. Miller --- drivers/net/arcnet/com20020-pci.c | 216 ++++++++++++++++++++++++++++++++------ include/linux/com20020.h | 16 +++ 2 files changed, 197 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index 7bb292e59559..f9e55527739d 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -63,6 +63,8 @@ MODULE_LICENSE("GPL"); static int com20020pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { + struct com20020_pci_channel_map *cm; + struct com20020_pci_card_info *ci; struct net_device *dev; struct arcnet_local *lp; int ioaddr, err; @@ -75,19 +77,15 @@ static int com20020pci_probe(struct pci_dev *pdev, const struct pci_device_id *i dev->netdev_ops = &com20020_netdev_ops; + ci = (struct com20020_pci_card_info *)id->driver_data; + lp = netdev_priv(dev); pci_set_drvdata(pdev, dev); - // SOHARD needs PCI base addr 4 - if (pdev->vendor==0x10B5) { - BUGMSG(D_NORMAL, "SOHARD\n"); - ioaddr = pci_resource_start(pdev, 4); - } - else { - BUGMSG(D_NORMAL, "Contemporary Controls\n"); - ioaddr = pci_resource_start(pdev, 2); - } + cm = &ci->chan_map_tbl[0]; + BUGMSG(D_NORMAL, "%s Controls\n", ci->name); + ioaddr = pci_resource_start(pdev, cm->bar); if (!request_region(ioaddr, ARCNET_TOTAL_SIZE, "com20020-pci")) { BUGMSG(D_INIT, "IO region %xh-%xh already allocated.\n", @@ -105,7 +103,7 @@ static int com20020pci_probe(struct pci_dev *pdev, const struct pci_device_id *i dev->irq = pdev->irq; dev->dev_addr[0] = node; lp->card_name = "PCI COM20020"; - lp->card_flags = id->driver_data; + lp->card_flags = ci->flags; lp->backplane = backplane; lp->clockp = clockp & 7; lp->clockm = clockm & 3; @@ -144,32 +142,180 @@ static void com20020pci_remove(struct pci_dev *pdev) free_netdev(dev); } +static struct com20020_pci_card_info card_info_10mbit = { + .name = "ARC-PCI", + .devcount = 1, + .chan_map_tbl = { + { 2, 0x00, 0x08 }, + }, + .flags = ARC_CAN_10MBIT, +}; + +static struct com20020_pci_card_info card_info_5mbit = { + .name = "ARC-PCI", + .devcount = 1, + .chan_map_tbl = { + { 2, 0x00, 0x08 }, + }, + .flags = ARC_IS_5MBIT, +}; + +static struct com20020_pci_card_info card_info_sohard = { + .name = "PLX-PCI", + .devcount = 1, + /* SOHARD needs PCI base addr 4 */ + .chan_map_tbl = { + {4, 0x00, 0x08}, + }, + .flags = ARC_CAN_10MBIT, +}; + static const struct pci_device_id com20020pci_id_table[] = { - { 0x1571, 0xa001, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa003, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa004, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa005, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa006, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa007, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa008, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa009, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa00a, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa00b, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa00c, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa00d, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa00e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa201, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x1571, 0xa202, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x1571, 0xa203, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x1571, 0xa204, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x1571, 0xa205, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x1571, 0xa206, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x10B5, 0x9030, 0x10B5, 0x2978, 0, 0, ARC_CAN_10MBIT }, - { 0x10B5, 0x9050, 0x10B5, 0x2273, 0, 0, ARC_CAN_10MBIT }, - { 0x14BA, 0x6000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x10B5, 0x2200, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - {0,} + { + 0x1571, 0xa001, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0, + }, + { + 0x1571, 0xa002, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0, + }, + { + 0x1571, 0xa003, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0 + }, + { + 0x1571, 0xa004, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0, + }, + { + 0x1571, 0xa005, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0 + }, + { + 0x1571, 0xa006, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0 + }, + { + 0x1571, 0xa007, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0 + }, + { + 0x1571, 0xa008, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0 + }, + { + 0x1571, 0xa009, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa00a, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa00b, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa00c, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa00d, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa00e, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa201, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x1571, 0xa202, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x1571, 0xa203, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x1571, 0xa204, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x1571, 0xa205, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x1571, 0xa206, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x10B5, 0x9030, + 0x10B5, 0x2978, + 0, 0, + (kernel_ulong_t)&card_info_sohard + }, + { + 0x10B5, 0x9050, + 0x10B5, 0x2273, + 0, 0, + (kernel_ulong_t)&card_info_sohard + }, + { + 0x14BA, 0x6000, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x10B5, 0x2200, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { 0, } }; MODULE_DEVICE_TABLE(pci, com20020pci_id_table); diff --git a/include/linux/com20020.h b/include/linux/com20020.h index 5dcfb944b6ce..6a1ceca61e7f 100644 --- a/include/linux/com20020.h +++ b/include/linux/com20020.h @@ -41,6 +41,22 @@ extern const struct net_device_ops com20020_netdev_ops; #define BUS_ALIGN 1 #endif +#define PLX_PCI_MAX_CARDS 1 + +struct com20020_pci_channel_map { + u32 bar; + u32 offset; + u32 size; /* 0x00 - auto, e.g. length of entire bar */ +}; + +struct com20020_pci_card_info { + const char *name; + int devcount; + + struct com20020_pci_channel_map chan_map_tbl[PLX_PCI_MAX_CARDS]; + + unsigned int flags; +}; #define _INTMASK (ioaddr+BUS_ALIGN*0) /* writable */ #define _STATUS (ioaddr+BUS_ALIGN*0) /* readable */ -- cgit v1.2.3 From c51da42a6346c0c747e70a4f5ae873da1150a784 Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Mon, 29 Sep 2014 11:55:37 +0200 Subject: ARCNET: add support for multi interfaces on com20020 The com20020-pci driver is currently designed to instance one netdev with one pci device. This patch adds support to instance many cards with one pci device, depending on the device data in the private data. Signed-off-by: Michael Grzeschik Signed-off-by: David S. Miller --- drivers/net/arcnet/com20020-pci.c | 149 ++++++++++++++++++++++++-------------- drivers/net/arcnet/com20020_cs.c | 4 - include/linux/com20020.h | 15 +++- 3 files changed, 109 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index f9e55527739d..fe87576bae3c 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -61,85 +62,125 @@ module_param(clockp, int, 0); module_param(clockm, int, 0); MODULE_LICENSE("GPL"); +static void com20020pci_remove(struct pci_dev *pdev); + static int com20020pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - struct com20020_pci_channel_map *cm; struct com20020_pci_card_info *ci; struct net_device *dev; struct arcnet_local *lp; - int ioaddr, err; + struct com20020_priv *priv; + int i, ioaddr, ret; + struct resource *r; if (pci_enable_device(pdev)) return -EIO; - dev = alloc_arcdev(device); - if (!dev) - return -ENOMEM; - - dev->netdev_ops = &com20020_netdev_ops; + priv = devm_kzalloc(&pdev->dev, sizeof(struct com20020_priv), + GFP_KERNEL); ci = (struct com20020_pci_card_info *)id->driver_data; + priv->ci = ci; - lp = netdev_priv(dev); + INIT_LIST_HEAD(&priv->list_dev); - pci_set_drvdata(pdev, dev); - cm = &ci->chan_map_tbl[0]; - BUGMSG(D_NORMAL, "%s Controls\n", ci->name); - ioaddr = pci_resource_start(pdev, cm->bar); + for (i = 0; i < ci->devcount; i++) { + struct com20020_pci_channel_map *cm = &ci->chan_map_tbl[i]; + struct com20020_dev *card; - if (!request_region(ioaddr, ARCNET_TOTAL_SIZE, "com20020-pci")) { - BUGMSG(D_INIT, "IO region %xh-%xh already allocated.\n", - ioaddr, ioaddr + ARCNET_TOTAL_SIZE - 1); - err = -EBUSY; - goto out_dev; - } + dev = alloc_arcdev(device); + if (!dev) { + ret = -ENOMEM; + goto out_port; + } - // Dummy access after Reset - // ARCNET controller needs this access to detect bustype - outb(0x00,ioaddr+1); - inb(ioaddr+1); - - dev->base_addr = ioaddr; - dev->irq = pdev->irq; - dev->dev_addr[0] = node; - lp->card_name = "PCI COM20020"; - lp->card_flags = ci->flags; - lp->backplane = backplane; - lp->clockp = clockp & 7; - lp->clockm = clockm & 3; - lp->timeout = timeout; - lp->hw.owner = THIS_MODULE; - - if (ASTATUS() == 0xFF) { - BUGMSG(D_NORMAL, "IO address %Xh was reported by PCI BIOS, " - "but seems empty!\n", ioaddr); - err = -EIO; - goto out_port; - } - if (com20020_check(dev)) { - err = -EIO; - goto out_port; + dev->netdev_ops = &com20020_netdev_ops; + + lp = netdev_priv(dev); + + BUGMSG(D_NORMAL, "%s Controls\n", ci->name); + ioaddr = pci_resource_start(pdev, cm->bar) + cm->offset; + + r = devm_request_region(&pdev->dev, ioaddr, cm->size, + "com20020-pci"); + if (!r) { + pr_err("IO region %xh-%xh already allocated.\n", + ioaddr, ioaddr + cm->size - 1); + ret = -EBUSY; + goto out_port; + } + + /* Dummy access after Reset + * ARCNET controller needs + * this access to detect bustype + */ + outb(0x00, ioaddr + 1); + inb(ioaddr + 1); + + dev->base_addr = ioaddr; + dev->dev_addr[0] = node; + dev->irq = pdev->irq; + lp->card_name = "PCI COM20020"; + lp->card_flags = ci->flags; + lp->backplane = backplane; + lp->clockp = clockp & 7; + lp->clockm = clockm & 3; + lp->timeout = timeout; + lp->hw.owner = THIS_MODULE; + + if (ASTATUS() == 0xFF) { + pr_err("IO address %Xh is empty!\n", ioaddr); + ret = -EIO; + goto out_port; + } + if (com20020_check(dev)) { + ret = -EIO; + goto out_port; + } + + card = devm_kzalloc(&pdev->dev, sizeof(struct com20020_dev), + GFP_KERNEL); + if (!card) { + pr_err("%s out of memory!\n", __func__); + return -ENOMEM; + } + + card->index = i; + card->pci_priv = priv; + card->dev = dev; + + dev_set_drvdata(&dev->dev, card); + + ret = com20020_found(dev, IRQF_SHARED); + if (ret) + goto out_port; + + list_add(&card->list, &priv->list_dev); } - if ((err = com20020_found(dev, IRQF_SHARED)) != 0) - goto out_port; + pci_set_drvdata(pdev, priv); return 0; out_port: - release_region(ioaddr, ARCNET_TOTAL_SIZE); -out_dev: - free_netdev(dev); - return err; + com20020pci_remove(pdev); + return ret; } static void com20020pci_remove(struct pci_dev *pdev) { - struct net_device *dev = pci_get_drvdata(pdev); - unregister_netdev(dev); - free_irq(dev->irq, dev); - release_region(dev->base_addr, ARCNET_TOTAL_SIZE); - free_netdev(dev); + struct com20020_dev *card, *tmpcard; + struct com20020_priv *priv; + + priv = pci_get_drvdata(pdev); + + list_for_each_entry_safe(card, tmpcard, &priv->list_dev, list) { + struct net_device *dev = card->dev; + + unregister_netdev(dev); + free_irq(dev->irq, dev); + free_netdev(dev); + } } static struct com20020_pci_card_info card_info_10mbit = { diff --git a/drivers/net/arcnet/com20020_cs.c b/drivers/net/arcnet/com20020_cs.c index 1a790a20210d..057d9582132a 100644 --- a/drivers/net/arcnet/com20020_cs.c +++ b/drivers/net/arcnet/com20020_cs.c @@ -112,10 +112,6 @@ static void com20020_detach(struct pcmcia_device *p_dev); /*====================================================================*/ -struct com20020_dev { - struct net_device *dev; -}; - static int com20020_probe(struct pcmcia_device *p_dev) { struct com20020_dev *info; diff --git a/include/linux/com20020.h b/include/linux/com20020.h index 6a1ceca61e7f..85898995b234 100644 --- a/include/linux/com20020.h +++ b/include/linux/com20020.h @@ -41,7 +41,7 @@ extern const struct net_device_ops com20020_netdev_ops; #define BUS_ALIGN 1 #endif -#define PLX_PCI_MAX_CARDS 1 +#define PLX_PCI_MAX_CARDS 2 struct com20020_pci_channel_map { u32 bar; @@ -58,6 +58,19 @@ struct com20020_pci_card_info { unsigned int flags; }; +struct com20020_priv { + struct com20020_pci_card_info *ci; + struct list_head list_dev; +}; + +struct com20020_dev { + struct list_head list; + struct net_device *dev; + + struct com20020_priv *pci_priv; + int index; +}; + #define _INTMASK (ioaddr+BUS_ALIGN*0) /* writable */ #define _STATUS (ioaddr+BUS_ALIGN*0) /* readable */ #define _COMMAND (ioaddr+BUS_ALIGN*1) /* standard arcnet commands */ -- cgit v1.2.3 From d82bd1229885d550d03926cfa937703f6caa3cc0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 29 Sep 2014 13:08:29 +0200 Subject: tcp: move TCP_ECN_create_request out of header After Octavian Purdilas tcp ipv4/ipv6 unification work this helper only has a single callsite. While at it, convert name to lowercase, suggested by Stephen. Suggested-by: Stephen Hemminger Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/tcp.h | 34 ---------------------------------- net/ipv4/tcp_input.c | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1f57c5363492..545a79aa4212 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -861,40 +861,6 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } -/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set - * - * If we receive a SYN packet with these bits set, it means a - * network is playing bad games with TOS bits. In order to - * avoid possible false congestion notifications, we disable - * TCP ECN negociation. - * - * Exception: tcp_ca wants ECN. This is required for DCTCP - * congestion control; it requires setting ECT on all packets, - * including SYN. We inverse the test in this case: If our - * local socket wants ECN, but peer only set ece/cwr (but not - * ECT in IP header) its probably a non-DCTCP aware sender. - */ -static inline void -TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, - const struct sock *listen_sk) -{ - const struct tcphdr *th = tcp_hdr(skb); - const struct net *net = sock_net(listen_sk); - bool th_ecn = th->ece && th->cwr; - bool ect, need_ecn; - - if (!th_ecn) - return; - - ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); - need_ecn = tcp_ca_needs_ecn(listen_sk); - - if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) - inet_rsk(req)->ecn_ok = 1; - else if (ect && need_ecn) - inet_rsk(req)->ecn_ok = 1; -} - /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fc133178c787..174181e28ef3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5906,6 +5906,40 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) #endif } +/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set + * + * If we receive a SYN packet with these bits set, it means a + * network is playing bad games with TOS bits. In order to + * avoid possible false congestion notifications, we disable + * TCP ECN negociation. + * + * Exception: tcp_ca wants ECN. This is required for DCTCP + * congestion control; it requires setting ECT on all packets, + * including SYN. We inverse the test in this case: If our + * local socket wants ECN, but peer only set ece/cwr (but not + * ECT in IP header) its probably a non-DCTCP aware sender. + */ +static void tcp_ecn_create_request(struct request_sock *req, + const struct sk_buff *skb, + const struct sock *listen_sk) +{ + const struct tcphdr *th = tcp_hdr(skb); + const struct net *net = sock_net(listen_sk); + bool th_ecn = th->ece && th->cwr; + bool ect, need_ecn; + + if (!th_ecn) + return; + + ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); + need_ecn = tcp_ca_needs_ecn(listen_sk); + + if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + inet_rsk(req)->ecn_ok = 1; + else if (ect && need_ecn) + inet_rsk(req)->ecn_ok = 1; +} + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) @@ -5966,7 +6000,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sk); + tcp_ecn_create_request(req, skb, sk); if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); -- cgit v1.2.3 From 79cf79abce71eb7dbc40e2f3121048ca5405cb47 Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Thu, 25 Sep 2014 16:31:08 +0200 Subject: macvlan: add source mode This patch adds a new mode of operation to macvlan, called "source". It allows one to set a list of allowed mac address, which is used to match against source mac address from received frames on underlying interface. This enables creating mac based VLAN associations, instead of standard port or tag based. The feature is useful to deploy 802.1x mac based behavior, where drivers of underlying interfaces doesn't allows that. Configuration is done through the netlink interface using e.g.: ip link add link eth0 name macvlan0 type macvlan mode source ip link add link eth0 name macvlan1 type macvlan mode source ip link set link dev macvlan0 type macvlan macaddr add 00:11:11:11:11:11 ip link set link dev macvlan0 type macvlan macaddr add 00:22:22:22:22:22 ip link set link dev macvlan0 type macvlan macaddr add 00:33:33:33:33:33 ip link set link dev macvlan1 type macvlan macaddr add 00:33:33:33:33:33 ip link set link dev macvlan1 type macvlan macaddr add 00:44:44:44:44:44 This allows clients with MAC addresses 00:11:11:11:11:11, 00:22:22:22:22:22 to be part of only VLAN associated with macvlan0 interface. Clients with MAC addresses 00:44:44:44:44:44 with only VLAN associated with macvlan1 interface. And client with MAC address 00:33:33:33:33:33 to be associated with both VLANs. Based on work of Stefan Gula v8: last version of Stefan Gula for Kernel 3.2.1 v9: rework onto linux-next 2014-03-12 by Michael Braun add MACADDR_SET command, enable to configure mac for source mode while creating interface v10: - reduce indention level - rename source_list to source_entry - use aligned 64bit ether address - use hash_64 instead of addr[5] v11: - rebase for 3.14 / linux-next 20.04.2014 v12 - rebase for linux-next 2014-09-25 Signed-off-by: Michael Braun Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 304 ++++++++++++++++++++++++++++++++++++++++++- include/linux/if_macvlan.h | 1 + include/uapi/linux/if_link.h | 12 ++ 3 files changed, 314 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 726edabff26b..e8a453f1b458 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -35,7 +35,8 @@ #include #include -#define MACVLAN_HASH_SIZE (1 << BITS_PER_BYTE) +#define MACVLAN_HASH_BITS 8 +#define MACVLAN_HASH_SIZE (1<>= 16; +#else + value <<= 16; +#endif + return hash_64(value, MACVLAN_HASH_BITS); +} + static struct macvlan_port *macvlan_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); @@ -73,20 +96,68 @@ static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { struct macvlan_dev *vlan; + u32 idx = macvlan_eth_hash(addr); - hlist_for_each_entry_rcu(vlan, &port->vlan_hash[addr[5]], hlist) { + hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist) { if (ether_addr_equal_64bits(vlan->dev->dev_addr, addr)) return vlan; } return NULL; } +static struct macvlan_source_entry *macvlan_hash_lookup_source( + const struct macvlan_dev *vlan, + const unsigned char *addr) +{ + struct macvlan_source_entry *entry; + u32 idx = macvlan_eth_hash(addr); + struct hlist_head *h = &vlan->port->vlan_source_hash[idx]; + + hlist_for_each_entry_rcu(entry, h, hlist) { + if (ether_addr_equal_64bits(entry->addr, addr) && + entry->vlan == vlan) + return entry; + } + return NULL; +} + +static int macvlan_hash_add_source(struct macvlan_dev *vlan, + const unsigned char *addr) +{ + struct macvlan_port *port = vlan->port; + struct macvlan_source_entry *entry; + struct hlist_head *h; + + entry = macvlan_hash_lookup_source(vlan, addr); + if (entry) + return 0; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + ether_addr_copy(entry->addr, addr); + entry->vlan = vlan; + h = &port->vlan_source_hash[macvlan_eth_hash(addr)]; + hlist_add_head_rcu(&entry->hlist, h); + vlan->macaddr_count++; + + return 0; +} + static void macvlan_hash_add(struct macvlan_dev *vlan) { struct macvlan_port *port = vlan->port; const unsigned char *addr = vlan->dev->dev_addr; + u32 idx = macvlan_eth_hash(addr); - hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[addr[5]]); + hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[idx]); +} + +static void macvlan_hash_del_source(struct macvlan_source_entry *entry) +{ + hlist_del_rcu(&entry->hlist); + kfree_rcu(entry, rcu); } static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync) @@ -267,6 +338,65 @@ err: atomic_long_inc(&skb->dev->rx_dropped); } +static void macvlan_flush_sources(struct macvlan_port *port, + struct macvlan_dev *vlan) +{ + int i; + + for (i = 0; i < MACVLAN_HASH_SIZE; i++) { + struct hlist_node *h, *n; + + hlist_for_each_safe(h, n, &port->vlan_source_hash[i]) { + struct macvlan_source_entry *entry; + + entry = hlist_entry(h, struct macvlan_source_entry, + hlist); + if (entry->vlan == vlan) + macvlan_hash_del_source(entry); + } + } + vlan->macaddr_count = 0; +} + +static void macvlan_forward_source_one(struct sk_buff *skb, + struct macvlan_dev *vlan) +{ + struct sk_buff *nskb; + struct net_device *dev; + int len; + int ret; + + dev = vlan->dev; + if (unlikely(!(dev->flags & IFF_UP))) + return; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return; + + len = nskb->len + ETH_HLEN; + nskb->dev = dev; + nskb->pkt_type = PACKET_HOST; + + ret = netif_rx(nskb); + macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, 0); +} + +static void macvlan_forward_source(struct sk_buff *skb, + struct macvlan_port *port, + const unsigned char *addr) +{ + struct macvlan_source_entry *entry; + u32 idx = macvlan_eth_hash(addr); + struct hlist_head *h = &port->vlan_source_hash[idx]; + + hlist_for_each_entry_rcu(entry, h, hlist) { + if (ether_addr_equal_64bits(entry->addr, addr)) + if (entry->vlan->dev->flags & IFF_UP) + macvlan_forward_source_one(skb, entry->vlan); + } +} + /* called under rcu_read_lock() from netif_receive_skb */ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) { @@ -285,6 +415,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) if (!skb) return RX_HANDLER_CONSUMED; eth = eth_hdr(skb); + macvlan_forward_source(skb, port, eth->h_source); src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { @@ -301,6 +432,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; } + macvlan_forward_source(skb, port, eth->h_source); if (port->passthru) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); @@ -667,6 +799,7 @@ static void macvlan_uninit(struct net_device *dev) free_percpu(vlan->pcpu_stats); + macvlan_flush_sources(port, vlan); port->count -= 1; if (!port->count) macvlan_port_destroy(port->dev); @@ -925,6 +1058,8 @@ static int macvlan_port_create(struct net_device *dev) INIT_LIST_HEAD(&port->vlans); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_hash[i]); + for (i = 0; i < MACVLAN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&port->vlan_source_hash[i]); skb_queue_head_init(&port->bc_queue); INIT_WORK(&port->bc_work, macvlan_process_broadcast); @@ -966,11 +1101,102 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[]) case MACVLAN_MODE_VEPA: case MACVLAN_MODE_BRIDGE: case MACVLAN_MODE_PASSTHRU: + case MACVLAN_MODE_SOURCE: + break; + default: + return -EINVAL; + } + } + + if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { + switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) { + case MACVLAN_MACADDR_ADD: + case MACVLAN_MACADDR_DEL: + case MACVLAN_MACADDR_FLUSH: + case MACVLAN_MACADDR_SET: break; default: return -EINVAL; } } + + if (data && data[IFLA_MACVLAN_MACADDR]) { + if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN) + return -EINVAL; + + if (!is_valid_ether_addr(nla_data(data[IFLA_MACVLAN_MACADDR]))) + return -EADDRNOTAVAIL; + } + + if (data && data[IFLA_MACVLAN_MACADDR_COUNT]) + return -EINVAL; + + return 0; +} + +/** + * reconfigure list of remote source mac address + * (only for macvlan devices in source mode) + * Note regarding alignment: all netlink data is aligned to 4 Byte, which + * suffices for both ether_addr_copy and ether_addr_equal_64bits usage. + */ +static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, + struct nlattr *data[]) +{ + char *addr = NULL; + int ret, rem, len; + struct nlattr *nla, *head; + struct macvlan_source_entry *entry; + + if (data[IFLA_MACVLAN_MACADDR]) + addr = nla_data(data[IFLA_MACVLAN_MACADDR]); + + if (mode == MACVLAN_MACADDR_ADD) { + if (!addr) + return -EINVAL; + + return macvlan_hash_add_source(vlan, addr); + + } else if (mode == MACVLAN_MACADDR_DEL) { + if (!addr) + return -EINVAL; + + entry = macvlan_hash_lookup_source(vlan, addr); + if (entry) { + macvlan_hash_del_source(entry); + vlan->macaddr_count--; + } + } else if (mode == MACVLAN_MACADDR_FLUSH) { + macvlan_flush_sources(vlan->port, vlan); + } else if (mode == MACVLAN_MACADDR_SET) { + macvlan_flush_sources(vlan->port, vlan); + + if (addr) { + ret = macvlan_hash_add_source(vlan, addr); + if (ret) + return ret; + } + + if (!data || !data[IFLA_MACVLAN_MACADDR_DATA]) + return 0; + + head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); + len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); + + nla_for_each_attr(nla, head, len, rem) { + if (nla_type(nla) != IFLA_MACVLAN_MACADDR || + nla_len(nla) != ETH_ALEN) + continue; + + addr = nla_data(nla); + ret = macvlan_hash_add_source(vlan, addr); + if (ret) + return ret; + } + } else { + return -EINVAL; + } + return 0; } @@ -981,6 +1207,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, struct macvlan_port *port; struct net_device *lowerdev; int err; + int macmode; if (!tb[IFLA_LINK]) return -EINVAL; @@ -1034,6 +1261,15 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, eth_hw_addr_inherit(dev, lowerdev); } + if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { + if (vlan->mode != MACVLAN_MODE_SOURCE) + return -EINVAL; + macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); + err = macvlan_changelink_sources(vlan, macmode, data); + if (err) + return err; + } + port->count += 1; err = register_netdevice(dev); if (err < 0) @@ -1070,6 +1306,8 @@ void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); + if (vlan->mode == MACVLAN_MODE_SOURCE) + macvlan_flush_sources(vlan->port, vlan); list_del_rcu(&vlan->list); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(vlan->lowerdev, dev); @@ -1082,6 +1320,8 @@ static int macvlan_changelink(struct net_device *dev, struct macvlan_dev *vlan = netdev_priv(dev); enum macvlan_mode mode; bool set_mode = false; + enum macvlan_macaddr_mode macmode; + int ret; /* Validate mode, but don't set yet: setting flags may fail. */ if (data && data[IFLA_MACVLAN_MODE]) { @@ -1091,6 +1331,9 @@ static int macvlan_changelink(struct net_device *dev, if ((mode == MACVLAN_MODE_PASSTHRU) != (vlan->mode == MACVLAN_MODE_PASSTHRU)) return -EINVAL; + if (vlan->mode == MACVLAN_MODE_SOURCE && + vlan->mode != mode) + macvlan_flush_sources(vlan->port, vlan); } if (data && data[IFLA_MACVLAN_FLAGS]) { @@ -1110,26 +1353,77 @@ static int macvlan_changelink(struct net_device *dev, } if (set_mode) vlan->mode = mode; + if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { + if (vlan->mode != MACVLAN_MODE_SOURCE) + return -EINVAL; + macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); + ret = macvlan_changelink_sources(vlan, macmode, data); + if (ret) + return ret; + } return 0; } +static size_t macvlan_get_size_mac(const struct macvlan_dev *vlan) +{ + if (vlan->macaddr_count == 0) + return 0; + return nla_total_size(0) /* IFLA_MACVLAN_MACADDR_DATA */ + + vlan->macaddr_count * nla_total_size(sizeof(u8) * ETH_ALEN); +} + static size_t macvlan_get_size(const struct net_device *dev) { + struct macvlan_dev *vlan = netdev_priv(dev); + return (0 + nla_total_size(4) /* IFLA_MACVLAN_MODE */ + nla_total_size(2) /* IFLA_MACVLAN_FLAGS */ + + nla_total_size(4) /* IFLA_MACVLAN_MACADDR_COUNT */ + + macvlan_get_size_mac(vlan) /* IFLA_MACVLAN_MACADDR */ ); } +static int macvlan_fill_info_macaddr(struct sk_buff *skb, + const struct macvlan_dev *vlan, + const int i) +{ + struct hlist_head *h = &vlan->port->vlan_source_hash[i]; + struct macvlan_source_entry *entry; + + hlist_for_each_entry_rcu(entry, h, hlist) { + if (entry->vlan != vlan) + continue; + if (nla_put(skb, IFLA_MACVLAN_MACADDR, ETH_ALEN, entry->addr)) + return 1; + } + return 0; +} + static int macvlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); + int i; + struct nlattr *nest; if (nla_put_u32(skb, IFLA_MACVLAN_MODE, vlan->mode)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_MACVLAN_FLAGS, vlan->flags)) goto nla_put_failure; + if (nla_put_u32(skb, IFLA_MACVLAN_MACADDR_COUNT, vlan->macaddr_count)) + goto nla_put_failure; + if (vlan->macaddr_count > 0) { + nest = nla_nest_start(skb, IFLA_MACVLAN_MACADDR_DATA); + if (nest == NULL) + goto nla_put_failure; + + for (i = 0; i < MACVLAN_HASH_SIZE; i++) { + if (macvlan_fill_info_macaddr(skb, vlan, i)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } return 0; nla_put_failure: @@ -1139,6 +1433,10 @@ nla_put_failure: static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_FLAGS] = { .type = NLA_U16 }, + [IFLA_MACVLAN_MACADDR_MODE] = { .type = NLA_U32 }, + [IFLA_MACVLAN_MACADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [IFLA_MACVLAN_MACADDR_DATA] = { .type = NLA_NESTED }, + [IFLA_MACVLAN_MACADDR_COUNT] = { .type = NLA_U32 }, }; int macvlan_link_register(struct rtnl_link_ops *ops) diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 6b2c7cf352a5..6f6929ea8a0c 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -60,6 +60,7 @@ struct macvlan_dev { #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif + unsigned int macaddr_count; }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index c80f95f6ee78..0bdb77e16875 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -303,6 +303,10 @@ enum { IFLA_MACVLAN_UNSPEC, IFLA_MACVLAN_MODE, IFLA_MACVLAN_FLAGS, + IFLA_MACVLAN_MACADDR_MODE, + IFLA_MACVLAN_MACADDR, + IFLA_MACVLAN_MACADDR_DATA, + IFLA_MACVLAN_MACADDR_COUNT, __IFLA_MACVLAN_MAX, }; @@ -313,6 +317,14 @@ enum macvlan_mode { MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ MACVLAN_MODE_PASSTHRU = 8,/* take over the underlying device */ + MACVLAN_MODE_SOURCE = 16,/* use source MAC address list to assign */ +}; + +enum macvlan_macaddr_mode { + MACVLAN_MACADDR_ADD, + MACVLAN_MACADDR_DEL, + MACVLAN_MACADDR_FLUSH, + MACVLAN_MACADDR_SET, }; #define MACVLAN_FLAG_NOPROMISC 1 -- cgit v1.2.3 From 22e0f8b9322cb1a48b1357e8f4ae6f5a9eca8cfa Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 28 Sep 2014 11:52:56 -0700 Subject: net: sched: make bstats per cpu and estimator RCU safe In order to run qdisc's without locking statistics and estimators need to be handled correctly. To resolve bstats make the statistics per cpu. And because this is only needed for qdiscs that are running without locks which is not the case for most qdiscs in the near future only create percpu stats when qdiscs set the TCQ_F_CPUSTATS flag. Next because estimators use the bstats to calculate packets per second and bytes per second the estimator code paths are updated to use the per cpu statistics. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/gen_stats.h | 11 ++++++++++ include/net/sch_generic.h | 22 ++++++++++++++++++- net/core/gen_estimator.c | 29 +++++++++++++++---------- net/core/gen_stats.c | 53 +++++++++++++++++++++++++++++++++++++++++----- net/netfilter/xt_RATEEST.c | 2 +- net/sched/act_api.c | 5 +++-- net/sched/act_police.c | 2 +- net/sched/sch_api.c | 29 +++++++++++++++++++------ net/sched/sch_atm.c | 2 +- net/sched/sch_cbq.c | 7 +++--- net/sched/sch_drr.c | 7 +++--- net/sched/sch_generic.c | 3 +++ net/sched/sch_hfsc.c | 13 +++++++----- net/sched/sch_htb.c | 12 +++++++---- net/sched/sch_mq.c | 2 +- net/sched/sch_mqprio.c | 4 ++-- net/sched/sch_multiq.c | 2 +- net/sched/sch_prio.c | 2 +- net/sched/sch_qfq.c | 8 ++++--- 19 files changed, 164 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index ea4271dceff0..ce3c1281f2a0 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -6,6 +6,11 @@ #include #include +struct gnet_stats_basic_cpu { + struct gnet_stats_basic_packed bstats; + struct u64_stats_sync syncp; +}; + struct gnet_dump { spinlock_t * lock; struct sk_buff * skb; @@ -27,7 +32,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d); int gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b); +void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b); int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, struct gnet_stats_rate_est64 *r); @@ -37,11 +46,13 @@ int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt); void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est64 *rate_est); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt); bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e65b8e0752af..4b9351120fd8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,7 @@ struct Qdisc { * multiqueue device. */ #define TCQ_F_WARN_NONWC (1 << 16) +#define TCQ_F_CPUSTATS 0x20 /* run using percpu statistics */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; @@ -83,7 +85,10 @@ struct Qdisc { */ unsigned long state; struct sk_buff_head q; - struct gnet_stats_basic_packed bstats; + union { + struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + } __packed; unsigned int __state; struct gnet_stats_queue qstats; struct rcu_head rcu_head; @@ -487,6 +492,10 @@ static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch) return qdisc_enqueue(skb, sch) & NET_XMIT_MASK; } +static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) +{ + return q->flags & TCQ_F_CPUSTATS; +} static inline void bstats_update(struct gnet_stats_basic_packed *bstats, const struct sk_buff *skb) @@ -495,6 +504,17 @@ static inline void bstats_update(struct gnet_stats_basic_packed *bstats, bstats->packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; } +static inline void qdisc_bstats_update_cpu(struct Qdisc *sch, + const struct sk_buff *skb) +{ + struct gnet_stats_basic_cpu *bstats = + this_cpu_ptr(sch->cpu_bstats); + + u64_stats_update_begin(&bstats->syncp); + bstats_update(&bstats->bstats, skb); + u64_stats_update_end(&bstats->syncp); +} + static inline void qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) { diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 9d33dfffca19..9dfb88a933e7 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -91,6 +91,8 @@ struct gen_estimator u32 avpps; struct rcu_head e_rcu; struct rb_node node; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct rcu_head head; }; struct gen_estimator_head @@ -115,9 +117,8 @@ static void est_timer(unsigned long arg) rcu_read_lock(); list_for_each_entry_rcu(e, &elist[idx].list, list) { - u64 nbytes; + struct gnet_stats_basic_packed b = {0}; u64 brate; - u32 npackets; u32 rate; spin_lock(e->stats_lock); @@ -125,15 +126,15 @@ static void est_timer(unsigned long arg) if (e->bstats == NULL) goto skip; - nbytes = e->bstats->bytes; - npackets = e->bstats->packets; - brate = (nbytes - e->last_bytes)<<(7 - idx); - e->last_bytes = nbytes; + __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats); + + brate = (b.bytes - e->last_bytes)<<(7 - idx); + e->last_bytes = b.bytes; e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); e->rate_est->bps = (e->avbps+0xF)>>5; - rate = (npackets - e->last_packets)<<(12 - idx); - e->last_packets = npackets; + rate = (b.packets - e->last_packets)<<(12 - idx); + e->last_packets = b.packets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); e->rate_est->pps = (e->avpps+0x1FF)>>10; skip: @@ -203,12 +204,14 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * */ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { struct gen_estimator *est; struct gnet_estimator *parm = nla_data(opt); + struct gnet_stats_basic_packed b = {0}; int idx; if (nla_len(opt) < sizeof(*parm)) @@ -221,15 +224,18 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (est == NULL) return -ENOBUFS; + __gnet_stats_copy_basic(&b, cpu_bstats, bstats); + idx = parm->interval + 2; est->bstats = bstats; est->rate_est = rate_est; est->stats_lock = stats_lock; est->ewma_log = parm->ewma_log; - est->last_bytes = bstats->bytes; + est->last_bytes = b.bytes; est->avbps = rate_est->bps<<5; - est->last_packets = bstats->packets; + est->last_packets = b.packets; est->avpps = rate_est->pps<<10; + est->cpu_bstats = cpu_bstats; spin_lock_bh(&est_tree_lock); if (!elist[idx].timer.function) { @@ -290,11 +296,12 @@ EXPORT_SYMBOL(gen_kill_estimator); * Returns 0 on success or a negative error code. */ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { gen_kill_estimator(bstats, rate_est); - return gen_new_estimator(bstats, rate_est, stats_lock, opt); + return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt); } EXPORT_SYMBOL(gen_replace_estimator); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 2ddbce4cce14..5ff8e80fe0bb 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -97,6 +97,43 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, } EXPORT_SYMBOL(gnet_stats_start_copy); +static void +__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu) +{ + int i; + + for_each_possible_cpu(i) { + struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + __u64 bytes; + __u32 packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); + bytes = bcpu->bstats.bytes; + packets = bcpu->bstats.packets; + } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + + bstats->bytes += bcpu->bstats.bytes; + bstats->packets += bcpu->bstats.packets; + } +} + +void +__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +{ + if (cpu) { + __gnet_stats_copy_basic_cpu(bstats, cpu); + } else { + bstats->bytes = b->bytes; + bstats->packets = b->packets; + } +} +EXPORT_SYMBOL(__gnet_stats_copy_basic); + /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV * @d: dumping handle @@ -109,19 +146,25 @@ EXPORT_SYMBOL(gnet_stats_start_copy); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) { + struct gnet_stats_basic_packed bstats = {0}; + + __gnet_stats_copy_basic(&bstats, cpu, b); + if (d->compat_tc_stats) { - d->tc_stats.bytes = b->bytes; - d->tc_stats.packets = b->packets; + d->tc_stats.bytes = bstats.bytes; + d->tc_stats.packets = bstats.packets; } if (d->tail) { struct gnet_stats_basic sb; memset(&sb, 0, sizeof(sb)); - sb.bytes = b->bytes; - sb.packets = b->packets; + sb.bytes = bstats.bytes; + sb.packets = bstats.packets; return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); } return 0; diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 370adf622cef..604df6fae6fc 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -136,7 +136,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; - ret = gen_new_estimator(&est->bstats, &est->rstats, + ret = gen_new_estimator(&est->bstats, NULL, &est->rstats, &est->lock, &cfg.opt); if (ret < 0) goto err2; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 648778aef1a2..eca4cf9ece2f 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -252,7 +252,8 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { - int err = gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est, + int err = gen_new_estimator(&p->tcfc_bstats, NULL, + &p->tcfc_rate_est, &p->tcfc_lock, est); if (err) { kfree(p); @@ -619,7 +620,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (err < 0) goto errout; - if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 || + if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index f32bcb094915..69791ca77a05 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -178,7 +178,7 @@ override: spin_lock_bh(&police->tcf_lock); if (est) { - err = gen_replace_estimator(&police->tcf_bstats, + err = gen_replace_estimator(&police->tcf_bstats, NULL, &police->tcf_rate_est, &police->tcf_lock, est); if (err) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 15e7beee266c..a95e3b48fa51 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -942,6 +942,13 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, sch->handle = handle; if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { + if (qdisc_is_percpu_stats(sch)) { + sch->cpu_bstats = + alloc_percpu(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto err_out4; + } + if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB]); if (IS_ERR(stab)) { @@ -964,8 +971,11 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, else root_lock = qdisc_lock(sch); - err = gen_new_estimator(&sch->bstats, &sch->rate_est, - root_lock, tca[TCA_RATE]); + err = gen_new_estimator(&sch->bstats, + sch->cpu_bstats, + &sch->rate_est, + root_lock, + tca[TCA_RATE]); if (err) goto err_out4; } @@ -984,6 +994,7 @@ err_out: return NULL; err_out4: + free_percpu(sch->cpu_bstats); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. @@ -1022,9 +1033,11 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) because change can't be undone. */ if (sch->flags & TCQ_F_MQROOT) goto out; - gen_replace_estimator(&sch->bstats, &sch->rate_est, - qdisc_root_sleeping_lock(sch), - tca[TCA_RATE]); + gen_replace_estimator(&sch->bstats, + sch->cpu_bstats, + &sch->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); } out: return 0; @@ -1299,6 +1312,7 @@ graft: static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { + struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); @@ -1334,7 +1348,10 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) goto nla_put_failure; - if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || + if (qdisc_is_percpu_stats(q)) + cpu_bstats = q->cpu_bstats; + + if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 || gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, &q->qstats) < 0) goto nla_put_failure; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index c398f9c3dbdd..01017663e5d8 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -639,7 +639,7 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, flow->qstats.qlen = flow->q->q.qlen; - if (gnet_stats_copy_basic(d, &flow->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 || gnet_stats_copy_queue(d, &flow->qstats) < 0) return -1; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index d2cd981ba60d..22a3a029a911 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1601,7 +1601,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (cl->undertime != PSCHED_PASTPERFECT) cl->xstats.undertime = cl->undertime - q->now; - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; @@ -1759,7 +1759,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t } if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { @@ -1852,7 +1853,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t goto failure; if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index d8b5ccfd248a..7a6243c5d270 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -88,7 +88,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl != NULL) { if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) @@ -116,7 +117,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->qdisc = &noop_qdisc; if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { @@ -282,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; } - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) return -1; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 11b28f651ad1..7c8e5d73d433 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -632,6 +632,9 @@ static void qdisc_rcu_free(struct rcu_head *head) { struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); + if (qdisc_is_percpu_stats(qdisc)) + free_percpu(qdisc->cpu_bstats); + kfree((char *) qdisc - qdisc->padded); } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 04b0de4c68b5..209b966b2eed 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1014,9 +1014,12 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cur_time = psched_get_time(); if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, - qdisc_root_sleeping_lock(sch), - tca[TCA_RATE]); + spinlock_t *lock = qdisc_root_sleeping_lock(sch); + + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + lock, + tca[TCA_RATE]); if (err) return err; } @@ -1063,7 +1066,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -ENOBUFS; if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { @@ -1374,7 +1377,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 063e953d9848..0256dee69bd6 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1144,7 +1144,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; @@ -1402,7 +1402,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, goto failure; if (htb_rate_est || tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE] ? : &est.nla); if (err) { @@ -1464,8 +1465,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, parent->children++; } else { if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + spinlock_t *lock = qdisc_root_sleeping_lock(sch); + + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + lock, tca[TCA_RATE]); if (err) return err; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index a8b2864a696b..d3a27fb607af 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -201,7 +201,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, sch = dev_queue->qdisc_sleeping; sch->qstats.qlen = sch->q.qlen; - if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || gnet_stats_copy_queue(d, &sch->qstats) < 0) return -1; return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 37e7d25d21f1..8917372fddc6 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -355,7 +355,7 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, } /* Reclaim root sleeping lock before completing stats */ spin_lock_bh(d->lock); - if (gnet_stats_copy_basic(d, &bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 || gnet_stats_copy_queue(d, &qstats) < 0) return -1; } else { @@ -363,7 +363,7 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, sch = dev_queue->qdisc_sleeping; sch->qstats.qlen = sch->q.qlen; - if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || gnet_stats_copy_queue(d, &sch->qstats) < 0) return -1; } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index c0466c1840f3..4adbf7fefc09 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -361,7 +361,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; cl_q->qstats.qlen = cl_q->q.qlen; - if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, &cl_q->qstats) < 0) return -1; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 03ef99e52a5c..68a8f25e30c3 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -325,7 +325,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; cl_q->qstats.qlen = cl_q->q.qlen; - if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, &cl_q->qstats) < 0) return -1; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 602ea01a4ddd..d59f8574540a 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -459,7 +459,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl != NULL) { /* modify existing class */ if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) @@ -484,7 +485,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->qdisc = &noop_qdisc; if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) @@ -667,7 +669,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) return -1; -- cgit v1.2.3 From 25331d6ce42bcf4b34b6705fce4da15c3fabe62f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 28 Sep 2014 11:53:29 -0700 Subject: net: sched: implement qstat helper routines This adds helpers to manipulate qstats logic and replaces locations that touch the counters directly. This simplifies future patches to push qstats onto per cpu counters. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 39 +++++++++++++++++++++++++++++++++------ net/sched/sch_api.c | 2 +- net/sched/sch_atm.c | 2 +- net/sched/sch_cbq.c | 10 +++++----- net/sched/sch_choke.c | 14 +++++++------- net/sched/sch_codel.c | 2 +- net/sched/sch_drr.c | 4 ++-- net/sched/sch_dsmark.c | 2 +- net/sched/sch_fifo.c | 2 +- net/sched/sch_fq.c | 4 ++-- net/sched/sch_fq_codel.c | 8 ++++---- net/sched/sch_gred.c | 4 ++-- net/sched/sch_hfsc.c | 8 ++++---- net/sched/sch_hhf.c | 8 ++++---- net/sched/sch_htb.c | 6 +++--- net/sched/sch_ingress.c | 2 +- net/sched/sch_multiq.c | 4 ++-- net/sched/sch_netem.c | 15 +++++++-------- net/sched/sch_pie.c | 2 +- net/sched/sch_prio.c | 4 ++-- net/sched/sch_qfq.c | 4 ++-- net/sched/sch_red.c | 8 ++++---- net/sched/sch_sfb.c | 10 +++++----- net/sched/sch_sfq.c | 17 +++++++++-------- net/sched/sch_tbf.c | 8 ++++---- 25 files changed, 108 insertions(+), 81 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 4b9351120fd8..23a0f0fc83d8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -521,11 +521,38 @@ static inline void qdisc_bstats_update(struct Qdisc *sch, bstats_update(&sch->bstats, skb); } +static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, + const struct sk_buff *skb) +{ + sch->qstats.backlog -= qdisc_pkt_len(skb); +} + +static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, + const struct sk_buff *skb) +{ + sch->qstats.backlog += qdisc_pkt_len(skb); +} + +static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) +{ + sch->qstats.drops += count; +} + +static inline void qdisc_qstats_drop(struct Qdisc *sch) +{ + sch->qstats.drops++; +} + +static inline void qdisc_qstats_overlimit(struct Qdisc *sch) +{ + sch->qstats.overlimits++; +} + static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff_head *list) { __skb_queue_tail(list, skb); - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; } @@ -541,7 +568,7 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch, struct sk_buff *skb = __skb_dequeue(list); if (likely(skb != NULL)) { - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); } @@ -560,7 +587,7 @@ static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); - sch->qstats.backlog -= len; + qdisc_qstats_backlog_dec(sch, skb); kfree_skb(skb); return len; } @@ -579,7 +606,7 @@ static inline struct sk_buff *__qdisc_dequeue_tail(struct Qdisc *sch, struct sk_buff *skb = __skb_dequeue_tail(list); if (likely(skb != NULL)) - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); return skb; } @@ -661,14 +688,14 @@ static inline unsigned int qdisc_queue_drop(struct Qdisc *sch) static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) { kfree_skb(skb); - sch->qstats.drops++; + qdisc_qstats_drop(sch); return NET_XMIT_DROP; } static inline int qdisc_reshape_fail(struct sk_buff *skb, struct Qdisc *sch) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); #ifdef CONFIG_NET_CLS_ACT if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch)) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a95e3b48fa51..2862bc61a358 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -763,7 +763,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) cops->put(sch, cl); } sch->q.qlen -= n; - sch->qstats.drops += drops; + __qdisc_qstats_drop(sch, drops); } } EXPORT_SYMBOL(qdisc_tree_decrease_qlen); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 01017663e5d8..040212cab988 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -417,7 +417,7 @@ done: if (ret != NET_XMIT_SUCCESS) { drop: __maybe_unused if (net_xmit_drop_count(ret)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); if (flow) flow->qstats.drops++; } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 22a3a029a911..60432c3d3cd4 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -377,7 +377,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) #endif if (cl == NULL) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -395,7 +395,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) } if (net_xmit_drop_count(ret)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); cbq_mark_toplevel(q, cl); cl->qstats.drops++; } @@ -650,11 +650,11 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) return 0; } if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return 0; } - sch->qstats.drops++; + qdisc_qstats_drop(sch); return -1; } #endif @@ -995,7 +995,7 @@ cbq_dequeue(struct Qdisc *sch) */ if (sch->q.qlen) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (q->wd_expires) qdisc_watchdog_schedule(&q->watchdog, now + q->wd_expires); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 8abc2625c3a1..c009eb9045ce 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -127,7 +127,7 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) if (idx == q->tail) choke_zap_tail_holes(q); - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); qdisc_tree_decrease_qlen(sch, 1); --sch->q.qlen; @@ -302,7 +302,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (q->vars.qavg > p->qth_max) { q->vars.qcount = -1; - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (use_harddrop(q) || !use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; @@ -315,7 +315,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->vars.qcount = 0; q->vars.qR = red_random(p); - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; @@ -332,7 +332,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->tab[q->tail] = skb; q->tail = (q->tail + 1) & q->tab_mask; ++sch->q.qlen; - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; } @@ -345,7 +345,7 @@ congestion_drop: other_drop: if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -365,7 +365,7 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch) q->tab[q->head] = NULL; choke_zap_head_holes(q); --sch->q.qlen; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); return skb; @@ -460,7 +460,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) ntab[tail++] = skb; continue; } - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); --sch->q.qlen; qdisc_drop(skb, sch); } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 2f9ab17db85a..de28f8e968e8 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -149,7 +149,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 7a6243c5d270..907b12fd6825 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -360,7 +360,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl = drr_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return err; } @@ -369,7 +369,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return err; } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 485e456c8139..227114f27f94 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -258,7 +258,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) err = qdisc_enqueue(skb, p->q); if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return err; } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index e15a9eb29087..2e2398cfc694 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -42,7 +42,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* queue full, remove one skb to fulfill the limit */ __qdisc_queue_drop_head(sch, &sch->q); - sch->qstats.drops++; + qdisc_qstats_drop(sch); qdisc_enqueue_tail(skb, sch); return NET_XMIT_CN; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index e12f997e1b4c..c9b9fcb53206 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -290,7 +290,7 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) flow->head = skb->next; skb->next = NULL; flow->qlen--; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } return skb; @@ -371,7 +371,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) f->qlen++; if (skb_is_retransmit(skb)) q->stat_tcp_retrans++; - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); if (fq_flow_is_detached(f)) { fq_flow_add_tail(&q->new_flows, f); if (time_after(jiffies, f->age + q->flow_refill_delay)) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 105cf5557630..9270e1b2f25d 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -164,8 +164,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch) q->backlogs[idx] -= len; kfree_skb(skb); sch->q.qlen--; - sch->qstats.drops++; - sch->qstats.backlog -= len; + qdisc_qstats_drop(sch); + qdisc_qstats_backlog_dec(sch, skb); flow->dropped++; return idx; } @@ -180,7 +180,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) idx = fq_codel_classify(skb, sch, &ret); if (idx == 0) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -190,7 +190,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) flow = &q->flows[idx]; flow_queue_add(flow, skb); q->backlogs[idx] += qdisc_pkt_len(skb); - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&flow->flowchain)) { list_add_tail(&flow->flowchain, &q->new_flows); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 12cbc09157fc..a4ca4517cdc8 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -209,7 +209,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_PROB_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; @@ -219,7 +219,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_HARD_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (gred_use_harddrop(t) || !gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 209b966b2eed..ad278251d811 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1591,7 +1591,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl = hfsc_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return err; } @@ -1600,7 +1600,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return err; } @@ -1643,7 +1643,7 @@ hfsc_dequeue(struct Qdisc *sch) */ cl = vttree_get_minvt(&q->root, cur_time); if (cl == NULL) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); hfsc_schedule_watchdog(sch); return NULL; } @@ -1698,7 +1698,7 @@ hfsc_drop(struct Qdisc *sch) list_move_tail(&cl->dlist, &q->droplist); } cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); sch->q.qlen--; return len; } diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index d85b6812a7d4..15d3aabfe250 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -376,8 +376,8 @@ static unsigned int hhf_drop(struct Qdisc *sch) struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; - sch->qstats.drops++; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_drop(sch); + qdisc_qstats_backlog_dec(sch, skb); kfree_skb(skb); } @@ -395,7 +395,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) bucket = &q->buckets[idx]; bucket_add(bucket, skb); - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&bucket->bucketchain)) { unsigned int weight; @@ -457,7 +457,7 @@ begin: if (bucket->head) { skb = dequeue_head(bucket); sch->q.qlen--; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); } if (!skb) { diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 0256dee69bd6..c40ab7a98c50 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -586,13 +586,13 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) #ifdef CONFIG_NET_CLS_ACT } else if (!cl) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; #endif } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); cl->qstats.drops++; } return ret; @@ -925,7 +925,7 @@ ok: goto ok; } } - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (likely(next_event > q->now)) { if (!test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(q->watchdog.qdisc)->state)) { diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index b351125f3849..eb5b8445fef9 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -69,7 +69,7 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) switch (result) { case TC_ACT_SHOT: result = TC_ACT_SHOT; - sch->qstats.drops++; + qdisc_qstats_drop(sch); break; case TC_ACT_STOLEN: case TC_ACT_QUEUED: diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 4adbf7fefc09..7f4e1d8504b0 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -75,7 +75,7 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -87,7 +87,7 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return ret; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 111d70fddaea..b34331967e02 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -429,12 +429,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* Drop packet? */ if (loss_event(q)) { if (q->ecn && INET_ECN_set_ce(skb)) - sch->qstats.drops++; /* mark packet */ + qdisc_qstats_drop(sch); /* mark packet */ else --count; } if (count == 0) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } @@ -478,7 +478,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) return qdisc_reshape_fail(skb, sch); - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); cb = netem_skb_cb(skb); if (q->gap == 0 || /* not doing reordering */ @@ -549,15 +549,14 @@ static unsigned int netem_drop(struct Qdisc *sch) sch->q.qlen--; skb->next = NULL; skb->prev = NULL; - len = qdisc_pkt_len(skb); - sch->qstats.backlog -= len; + qdisc_qstats_backlog_dec(sch, skb); kfree_skb(skb); } } if (!len && q->qdisc && q->qdisc->ops->drop) len = q->qdisc->ops->drop(q->qdisc); if (len) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return len; } @@ -575,7 +574,7 @@ tfifo_dequeue: skb = __skb_dequeue(&sch->q); if (skb) { deliver: - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); return skb; @@ -610,7 +609,7 @@ deliver: if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); qdisc_tree_decrease_qlen(sch, 1); } } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index fefeeb73f15f..33d7a98a7a97 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -232,7 +232,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 68a8f25e30c3..b411e78a02fc 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -77,7 +77,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -89,7 +89,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return ret; } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index d59f8574540a..3fb26555c79b 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1229,7 +1229,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl = qfq_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return err; } @@ -1249,7 +1249,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) pr_debug("qfq_enqueue: enqueue failed %d\n", err); if (net_xmit_drop_count(err)) { cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return err; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 633e32defdcc..6c0534cc7758 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -74,7 +74,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_PROB_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; @@ -84,7 +84,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_HARD_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (red_use_harddrop(q) || !red_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; @@ -100,7 +100,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) sch->q.qlen++; } else if (net_xmit_drop_count(ret)) { q->stats.pdrop++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return ret; @@ -142,7 +142,7 @@ static unsigned int red_drop(struct Qdisc *sch) if (child->ops->drop && (len = child->ops->drop(child)) > 0) { q->stats.other++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); sch->q.qlen--; return len; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 1562fb2b3f46..5819dd82630d 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -290,7 +290,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct flow_keys keys; if (unlikely(sch->q.qlen >= q->limit)) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); q->stats.queuedrop++; goto drop; } @@ -348,7 +348,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) sfb_skb_cb(skb)->hashes[slot] = 0; if (unlikely(minqlen >= q->max)) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); q->stats.bucketdrop++; goto drop; } @@ -376,7 +376,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } } if (sfb_rate_limit(skb, q)) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); q->stats.penaltydrop++; goto drop; } @@ -411,7 +411,7 @@ enqueue: increment_qlen(skb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return ret; @@ -420,7 +420,7 @@ drop: return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 80c36bd54abc..158dfa641d18 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -331,8 +331,8 @@ drop: sfq_dec(q, x); kfree_skb(skb); sch->q.qlen--; - sch->qstats.drops++; - sch->qstats.backlog -= len; + qdisc_qstats_drop(sch); + qdisc_qstats_backlog_dec(sch, skb); return len; } @@ -379,7 +379,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) hash = sfq_classify(skb, sch, &ret); if (hash == 0) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -409,7 +409,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_PROB_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (sfq_prob_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && @@ -426,7 +426,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) goto congestion_drop; case RED_HARD_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (sfq_hard_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && @@ -461,7 +461,7 @@ congestion_drop: } enqueue: - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); slot->backlog += qdisc_pkt_len(skb); slot_queue_add(slot, skb); sfq_inc(q, x); @@ -520,7 +520,7 @@ next_slot: sfq_dec(q, a); qdisc_bstats_update(sch, skb); sch->q.qlen--; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); slot->backlog -= qdisc_pkt_len(skb); /* Is the slot empty? */ if (slot->qlen == 0) { @@ -586,7 +586,8 @@ static void sfq_rehash(struct Qdisc *sch) if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) { -drop: sch->qstats.backlog -= qdisc_pkt_len(skb); +drop: + qdisc_qstats_backlog_dec(sch, skb); kfree_skb(skb); dropped++; continue; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 0c39b754083b..77edffe329c4 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -175,7 +175,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(segs, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); } else { nb++; } @@ -201,7 +201,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return ret; } @@ -216,7 +216,7 @@ static unsigned int tbf_drop(struct Qdisc *sch) if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { sch->q.qlen--; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return len; } @@ -281,7 +281,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) (cf. CSZ, HPFQ, HFSC) */ - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); } return NULL; } -- cgit v1.2.3 From 6401585366326fc0ecbc372ec60d1a15cd8be2f5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 28 Sep 2014 11:53:57 -0700 Subject: net: sched: restrict use of qstats qlen This removes the use of qstats->qlen variable from the classifiers and makes it an explicit argument to gnet_stats_copy_queue(). The qlen represents the qdisc queue length and is packed into the qstats at the last moment before passnig to user space. By handling it explicitely we avoid, in the percpu stats case, having to figure out which per_cpu variable to put it in. It would probably be best to remove it from qstats completely but qstats is a user space ABI and can't be broken. A future patch could make an internal only qstats structure that would avoid having to allocate an additional u32 variable on the Qdisc struct. This would make the qstats struct 128bits instead of 128+32. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/gen_stats.h | 3 ++- net/core/gen_stats.c | 6 +++++- net/sched/act_api.c | 4 +++- net/sched/sch_api.c | 5 +++-- net/sched/sch_atm.c | 4 +--- net/sched/sch_cbq.c | 3 +-- net/sched/sch_drr.c | 7 +++---- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_hfsc.c | 3 +-- net/sched/sch_htb.c | 5 +++-- net/sched/sch_mq.c | 4 +--- net/sched/sch_mqprio.c | 9 ++++----- net/sched/sch_multiq.c | 3 +-- net/sched/sch_prio.c | 3 +-- net/sched/sch_qfq.c | 3 +-- net/sched/sch_sfq.c | 2 +- 16 files changed, 32 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index ce3c1281f2a0..de9b3dd5750e 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -40,7 +40,8 @@ void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, struct gnet_stats_rate_est64 *r); -int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q); +int gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue *q, __u32 len); int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 5ff8e80fe0bb..ad3ecb6ba835 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -219,6 +219,7 @@ EXPORT_SYMBOL(gnet_stats_copy_rate_est); * gnet_stats_copy_queue - copy queue statistics into statistics TLV * @d: dumping handle * @q: queue statistics + * @qlen: queue length statistics * * Appends the queue statistics to the top level TLV created by * gnet_stats_start_copy(). @@ -227,8 +228,11 @@ EXPORT_SYMBOL(gnet_stats_copy_rate_est); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) +gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue *q, __u32 qlen) { + q->qlen = qlen; + if (d->compat_tc_stats) { d->tc_stats.drops = q->drops; d->tc_stats.qlen = q->qlen; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index eca4cf9ece2f..2e134093b8ec 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -623,7 +623,9 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || - gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0) + gnet_stats_copy_queue(&d, + &p->tcfc_qstats, + p->tcfc_qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 2862bc61a358..ca00ea8e84dc 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1318,6 +1318,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; struct qdisc_size_table *stab; + __u32 qlen; cond_resched(); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); @@ -1335,7 +1336,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; - q->qstats.qlen = q->q.qlen; + qlen = q->q.qlen; stab = rtnl_dereference(q->stab); if (stab && qdisc_dump_stab(skb, stab) < 0) @@ -1353,7 +1354,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 || gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || - gnet_stats_copy_queue(&d, &q->qstats) < 0) + gnet_stats_copy_queue(&d, &q->qstats, qlen) < 0) goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 040212cab988..c145eb6279cc 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -637,10 +637,8 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, { struct atm_flow_data *flow = (struct atm_flow_data *)arg; - flow->qstats.qlen = flow->q->q.qlen; - if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 || - gnet_stats_copy_queue(d, &flow->qstats) < 0) + gnet_stats_copy_queue(d, &flow->qstats, flow->q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 60432c3d3cd4..c610081ffba5 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1594,7 +1594,6 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; - cl->qstats.qlen = cl->q->q.qlen; cl->xstats.avgidle = cl->avgidle; cl->xstats.undertime = 0; @@ -1603,7 +1602,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats) < 0) + gnet_stats_copy_queue(d, &cl->qstats, cl->q->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 907b12fd6825..5835a93905b1 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -275,17 +275,16 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct drr_class *cl = (struct drr_class *)arg; + __u32 qlen = cl->qdisc->q.qlen; struct tc_drr_stats xstats; memset(&xstats, 0, sizeof(xstats)); - if (cl->qdisc->q.qlen) { + if (qlen) xstats.deficit = cl->deficit; - cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; - } if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + gnet_stats_copy_queue(d, &cl->qdisc->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 9270e1b2f25d..226d73597539 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -550,7 +550,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.backlog = q->backlogs[idx]; qs.drops = flow->dropped; } - if (gnet_stats_copy_queue(d, &qs) < 0) + if (gnet_stats_copy_queue(d, &qs, 0) < 0) return -1; if (idx < q->flows_cnt) return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index ad278251d811..d364acb2ad07 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1370,7 +1370,6 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct hfsc_class *cl = (struct hfsc_class *)arg; struct tc_hfsc_stats xstats; - cl->qstats.qlen = cl->qdisc->q.qlen; cl->qstats.backlog = cl->qdisc->qstats.backlog; xstats.level = cl->level; xstats.period = cl->cl_vtperiod; @@ -1379,7 +1378,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats) < 0) + gnet_stats_copy_queue(d, &cl->qstats, cl->qdisc->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index c40ab7a98c50..3a691fd88f99 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1138,15 +1138,16 @@ static int htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct htb_class *cl = (struct htb_class *)arg; + __u32 qlen = 0; if (!cl->level && cl->un.leaf.q) - cl->qstats.qlen = cl->un.leaf.q->q.qlen; + qlen = cl->un.leaf.q->q.qlen; cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats) < 0) + gnet_stats_copy_queue(d, &cl->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index d3a27fb607af..6416a6942062 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -112,7 +112,6 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.qlen += qdisc->qstats.qlen; sch->qstats.backlog += qdisc->qstats.backlog; sch->qstats.drops += qdisc->qstats.drops; sch->qstats.requeues += qdisc->qstats.requeues; @@ -200,9 +199,8 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - sch->qstats.qlen = sch->q.qlen; if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, &sch->qstats) < 0) + gnet_stats_copy_queue(d, &sch->qstats, sch->q.qlen) < 0) return -1; return 0; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 8917372fddc6..03dbeb5e8181 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -236,7 +236,6 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.qlen += qdisc->qstats.qlen; sch->qstats.backlog += qdisc->qstats.backlog; sch->qstats.drops += qdisc->qstats.drops; sch->qstats.requeues += qdisc->qstats.requeues; @@ -327,6 +326,7 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (cl <= netdev_get_num_tc(dev)) { int i; + __u32 qlen = 0; struct Qdisc *qdisc; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_packed bstats = {0}; @@ -344,9 +344,9 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); + qlen += qdisc->q.qlen; bstats.bytes += qdisc->bstats.bytes; bstats.packets += qdisc->bstats.packets; - qstats.qlen += qdisc->qstats.qlen; qstats.backlog += qdisc->qstats.backlog; qstats.drops += qdisc->qstats.drops; qstats.requeues += qdisc->qstats.requeues; @@ -356,15 +356,14 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, /* Reclaim root sleeping lock before completing stats */ spin_lock_bh(d->lock); if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 || - gnet_stats_copy_queue(d, &qstats) < 0) + gnet_stats_copy_queue(d, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - sch->qstats.qlen = sch->q.qlen; if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, &sch->qstats) < 0) + gnet_stats_copy_queue(d, &sch->qstats, sch->q.qlen) < 0) return -1; } return 0; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 7f4e1d8504b0..53357b368bff 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -360,9 +360,8 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - cl_q->qstats.qlen = cl_q->q.qlen; if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || - gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + gnet_stats_copy_queue(d, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index b411e78a02fc..4644f55242d2 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -324,9 +324,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - cl_q->qstats.qlen = cl_q->q.qlen; if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || - gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + gnet_stats_copy_queue(d, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 3fb26555c79b..66df9d9e301a 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -664,14 +664,13 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct tc_qfq_stats xstats; memset(&xstats, 0, sizeof(xstats)); - cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + gnet_stats_copy_queue(d, &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 158dfa641d18..d4afcbc1c6f7 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -871,7 +871,7 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.qlen = slot->qlen; qs.backlog = slot->backlog; } - if (gnet_stats_copy_queue(d, &qs) < 0) + if (gnet_stats_copy_queue(d, &qs, qs.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } -- cgit v1.2.3 From b0ab6f92752b9f9d8da980506e9df3bd9dcd7ed3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 28 Sep 2014 11:54:24 -0700 Subject: net: sched: enable per cpu qstats After previous patches to simplify qstats the qstats can be made per cpu with a packed union in Qdisc struct. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/gen_stats.h | 3 ++- include/net/sch_generic.h | 12 ++++++++++- net/core/gen_stats.c | 55 +++++++++++++++++++++++++++++++++++++++++------ net/sched/act_api.c | 2 +- net/sched/sch_api.c | 12 +++++++++-- net/sched/sch_atm.c | 2 +- net/sched/sch_cbq.c | 2 +- net/sched/sch_drr.c | 2 +- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_hfsc.c | 2 +- net/sched/sch_htb.c | 2 +- net/sched/sch_mq.c | 2 +- net/sched/sch_mqprio.c | 5 +++-- net/sched/sch_multiq.c | 2 +- net/sched/sch_prio.c | 2 +- net/sched/sch_qfq.c | 3 ++- net/sched/sch_sfq.c | 2 +- 17 files changed, 87 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index de9b3dd5750e..cbafa3768d48 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -41,7 +41,8 @@ int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, struct gnet_stats_rate_est64 *r); int gnet_stats_copy_queue(struct gnet_dump *d, - struct gnet_stats_queue *q, __u32 len); + struct gnet_stats_queue __percpu *cpu_q, + struct gnet_stats_queue *q, __u32 qlen); int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 23a0f0fc83d8..f12669819d1a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -90,7 +90,10 @@ struct Qdisc { struct gnet_stats_basic_cpu __percpu *cpu_bstats; } __packed; unsigned int __state; - struct gnet_stats_queue qstats; + union { + struct gnet_stats_queue qstats; + struct gnet_stats_queue __percpu *cpu_qstats; + } __packed; struct rcu_head rcu_head; int padded; atomic_t refcnt; @@ -543,6 +546,13 @@ static inline void qdisc_qstats_drop(struct Qdisc *sch) sch->qstats.drops++; } +static inline void qdisc_qstats_drop_cpu(struct Qdisc *sch) +{ + struct gnet_stats_queue *qstats = this_cpu_ptr(sch->cpu_qstats); + + qstats->drops++; +} + static inline void qdisc_qstats_overlimit(struct Qdisc *sch) { sch->qstats.overlimits++; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index ad3ecb6ba835..14681b97a4f3 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -215,33 +215,74 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, } EXPORT_SYMBOL(gnet_stats_copy_rate_est); +static void +__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *q) +{ + int i; + + for_each_possible_cpu(i) { + const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); + + qstats->qlen = 0; + qstats->backlog += qcpu->backlog; + qstats->drops += qcpu->drops; + qstats->requeues += qcpu->requeues; + qstats->overlimits += qcpu->overlimits; + } +} + +static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q, + __u32 qlen) +{ + if (cpu) { + __gnet_stats_copy_queue_cpu(qstats, cpu); + } else { + qstats->qlen = q->qlen; + qstats->backlog = q->backlog; + qstats->drops = q->drops; + qstats->requeues = q->requeues; + qstats->overlimits = q->overlimits; + } + + qstats->qlen = qlen; +} + /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV * @d: dumping handle + * @cpu_q: per cpu queue statistics * @q: queue statistics * @qlen: queue length statistics * * Appends the queue statistics to the top level TLV created by - * gnet_stats_start_copy(). + * gnet_stats_start_copy(). Using per cpu queue statistics if + * they are available. * * Returns 0 on success or -1 with the statistic lock released * if the room in the socket buffer was not sufficient. */ int gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen) { - q->qlen = qlen; + struct gnet_stats_queue qstats = {0}; + + __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen); if (d->compat_tc_stats) { - d->tc_stats.drops = q->drops; - d->tc_stats.qlen = q->qlen; - d->tc_stats.backlog = q->backlog; - d->tc_stats.overlimits = q->overlimits; + d->tc_stats.drops = qstats.drops; + d->tc_stats.qlen = qstats.qlen; + d->tc_stats.backlog = qstats.backlog; + d->tc_stats.overlimits = qstats.overlimits; } if (d->tail) - return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q)); + return gnet_stats_copy(d, TCA_STATS_QUEUE, + &qstats, sizeof(qstats)); return 0; } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2e134093b8ec..3d43e4979f27 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -623,7 +623,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || - gnet_stats_copy_queue(&d, + gnet_stats_copy_queue(&d, NULL, &p->tcfc_qstats, p->tcfc_qstats.qlen) < 0) goto errout; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ca00ea8e84dc..aa8329508dba 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -947,6 +947,10 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, alloc_percpu(struct gnet_stats_basic_cpu); if (!sch->cpu_bstats) goto err_out4; + + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) + goto err_out4; } if (tca[TCA_STAB]) { @@ -995,6 +999,7 @@ err_out: err_out4: free_percpu(sch->cpu_bstats); + free_percpu(sch->cpu_qstats); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. @@ -1313,6 +1318,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); @@ -1349,12 +1355,14 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) goto nla_put_failure; - if (qdisc_is_percpu_stats(q)) + if (qdisc_is_percpu_stats(q)) { cpu_bstats = q->cpu_bstats; + cpu_qstats = q->cpu_qstats; + } if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 || gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || - gnet_stats_copy_queue(&d, &q->qstats, qlen) < 0) + gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index c145eb6279cc..e3e2cc5fd068 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -638,7 +638,7 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct atm_flow_data *flow = (struct atm_flow_data *)arg; if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 || - gnet_stats_copy_queue(d, &flow->qstats, flow->q->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index c610081ffba5..beeb75f80fdb 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1602,7 +1602,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats, cl->q->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 5835a93905b1..338706092c27 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -284,7 +284,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qdisc->qstats, qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 226d73597539..b9ca32ebc1de 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -550,7 +550,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.backlog = q->backlogs[idx]; qs.drops = flow->dropped; } - if (gnet_stats_copy_queue(d, &qs, 0) < 0) + if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0) return -1; if (idx < q->flows_cnt) return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index d364acb2ad07..e6c7416d0332 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1378,7 +1378,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats, cl->qdisc->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 3a691fd88f99..f1acb0f60dc3 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1147,7 +1147,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats, qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 6416a6942062..f3cbaecd283a 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -200,7 +200,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, sch = dev_queue->qdisc_sleeping; if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, &sch->qstats, sch->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; return 0; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 03dbeb5e8181..3811a745452c 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -356,14 +356,15 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, /* Reclaim root sleeping lock before completing stats */ spin_lock_bh(d->lock); if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 || - gnet_stats_copy_queue(d, &qstats, qlen) < 0) + gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, &sch->qstats, sch->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, + &sch->qstats, sch->q.qlen) < 0) return -1; } return 0; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 53357b368bff..42dd218871e0 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -361,7 +361,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || - gnet_stats_copy_queue(d, &cl_q->qstats, cl_q->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 4644f55242d2..8e5cd34aaa74 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -325,7 +325,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || - gnet_stats_copy_queue(d, &cl_q->qstats, cl_q->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 66df9d9e301a..3ec7e88a43ca 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -670,7 +670,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, + &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index d4afcbc1c6f7..b877140beda5 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -871,7 +871,7 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.qlen = slot->qlen; qs.backlog = slot->backlog; } - if (gnet_stats_copy_queue(d, &qs, qs.qlen) < 0) + if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } -- cgit v1.2.3 From 2101e533f41a90b25bee17ce969734e26eb0eb55 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Fri, 26 Sep 2014 00:09:19 +0200 Subject: bcma: register bcma as device tree driver This driver is used by the bcm53xx ARM SoC code. Now it is possible to give the address of the chipcommon core in device tree and bcma will search for all the other cores. Signed-off-by: Hauke Mehrtens Acked-by: Arnd Bergmann Signed-off-by: John W. Linville --- Documentation/devicetree/bindings/bus/bcma.txt | 20 +++++++ drivers/bcma/bcma_private.h | 14 +++++ drivers/bcma/host_soc.c | 81 ++++++++++++++++++++++++++ drivers/bcma/main.c | 52 ++++++++++++++++- include/linux/bcma/bcma.h | 2 + 5 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/bus/bcma.txt (limited to 'include') diff --git a/Documentation/devicetree/bindings/bus/bcma.txt b/Documentation/devicetree/bindings/bus/bcma.txt new file mode 100644 index 000000000000..e9070c161172 --- /dev/null +++ b/Documentation/devicetree/bindings/bus/bcma.txt @@ -0,0 +1,20 @@ +Driver for ARM AXI Bus with Broadcom Plugins (bcma) + +Required properties: + +- compatible : brcm,bus-axi + +- reg : iomem address range of chipcommon core + +The cores on the AXI bus are automatically detected by bcma with the +memory ranges they are using and they get registered afterwards. + +Example: + + axi@18000000 { + compatible = "brcm,bus-axi"; + reg = <0x18000000 0x1000>; + ranges = <0x00000000 0x18000000 0x00100000>; + #address-cells = <1>; + #size-cells = <1>; + }; diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h index b40be43c6f31..b6412b2d748d 100644 --- a/drivers/bcma/bcma_private.h +++ b/drivers/bcma/bcma_private.h @@ -88,6 +88,20 @@ extern int __init bcma_host_pci_init(void); extern void __exit bcma_host_pci_exit(void); #endif /* CONFIG_BCMA_HOST_PCI */ +/* host_soc.c */ +#if defined(CONFIG_BCMA_HOST_SOC) && defined(CONFIG_OF) +extern int __init bcma_host_soc_register_driver(void); +extern void __exit bcma_host_soc_unregister_driver(void); +#else +static inline int __init bcma_host_soc_register_driver(void) +{ + return 0; +} +static inline void __exit bcma_host_soc_unregister_driver(void) +{ +} +#endif /* CONFIG_BCMA_HOST_SOC && CONFIG_OF */ + /* driver_pci.c */ u32 bcma_pcie_read(struct bcma_drv_pci *pc, u32 address); diff --git a/drivers/bcma/host_soc.c b/drivers/bcma/host_soc.c index 718e054dd727..335cbcfd945b 100644 --- a/drivers/bcma/host_soc.c +++ b/drivers/bcma/host_soc.c @@ -7,6 +7,9 @@ #include "bcma_private.h" #include "scan.h" +#include +#include +#include #include #include @@ -176,6 +179,7 @@ int __init bcma_host_soc_register(struct bcma_soc *soc) /* Host specific */ bus->hosttype = BCMA_HOSTTYPE_SOC; bus->ops = &bcma_host_soc_ops; + bus->host_pdev = NULL; /* Initialize struct, detect chip */ bcma_init_bus(bus); @@ -195,3 +199,80 @@ int __init bcma_host_soc_init(struct bcma_soc *soc) return err; } + +#ifdef CONFIG_OF +static int bcma_host_soc_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + struct bcma_bus *bus; + int err; + + /* Alloc */ + bus = devm_kzalloc(dev, sizeof(*bus), GFP_KERNEL); + if (!bus) + return -ENOMEM; + + /* Map MMIO */ + bus->mmio = of_iomap(np, 0); + if (!bus->mmio) + return -ENOMEM; + + /* Host specific */ + bus->hosttype = BCMA_HOSTTYPE_SOC; + bus->ops = &bcma_host_soc_ops; + bus->host_pdev = pdev; + + /* Initialize struct, detect chip */ + bcma_init_bus(bus); + + /* Register */ + err = bcma_bus_register(bus); + if (err) + goto err_unmap_mmio; + + platform_set_drvdata(pdev, bus); + + return err; + +err_unmap_mmio: + iounmap(bus->mmio); + return err; +} + +static int bcma_host_soc_remove(struct platform_device *pdev) +{ + struct bcma_bus *bus = platform_get_drvdata(pdev); + + bcma_bus_unregister(bus); + iounmap(bus->mmio); + platform_set_drvdata(pdev, NULL); + + return 0; +} + +static const struct of_device_id bcma_host_soc_of_match[] = { + { .compatible = "brcm,bus-axi", }, + {}, +}; +MODULE_DEVICE_TABLE(of, bcma_host_soc_of_match); + +static struct platform_driver bcma_host_soc_driver = { + .driver = { + .name = "bcma-host-soc", + .of_match_table = bcma_host_soc_of_match, + }, + .probe = bcma_host_soc_probe, + .remove = bcma_host_soc_remove, +}; + +int __init bcma_host_soc_register_driver(void) +{ + return platform_driver_register(&bcma_host_soc_driver); +} + +void __exit bcma_host_soc_unregister_driver(void) +{ + platform_driver_unregister(&bcma_host_soc_driver); +} +#endif /* CONFIG_OF */ diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index c421403cab43..d1656c2f70af 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -10,6 +10,7 @@ #include #include #include +#include MODULE_DESCRIPTION("Broadcom's specific AMBA driver"); MODULE_LICENSE("GPL"); @@ -131,6 +132,43 @@ static bool bcma_is_core_needed_early(u16 core_id) return false; } +#ifdef CONFIG_OF +static struct device_node *bcma_of_find_child_device(struct platform_device *parent, + struct bcma_device *core) +{ + struct device_node *node; + u64 size; + const __be32 *reg; + + if (!parent || !parent->dev.of_node) + return NULL; + + for_each_child_of_node(parent->dev.of_node, node) { + reg = of_get_address(node, 0, &size, NULL); + if (!reg) + continue; + if (of_translate_address(node, reg) == core->addr) + return node; + } + return NULL; +} + +static void bcma_of_fill_device(struct platform_device *parent, + struct bcma_device *core) +{ + struct device_node *node; + + node = bcma_of_find_child_device(parent, core); + if (node) + core->dev.of_node = node; +} +#else +static void bcma_of_fill_device(struct platform_device *parent, + struct bcma_device *core) +{ +} +#endif /* CONFIG_OF */ + static void bcma_register_core(struct bcma_bus *bus, struct bcma_device *core) { int err; @@ -147,7 +185,13 @@ static void bcma_register_core(struct bcma_bus *bus, struct bcma_device *core) break; case BCMA_HOSTTYPE_SOC: core->dev.dma_mask = &core->dev.coherent_dma_mask; - core->dma_dev = &core->dev; + if (bus->host_pdev) { + core->dma_dev = &bus->host_pdev->dev; + core->dev.parent = &bus->host_pdev->dev; + bcma_of_fill_device(bus->host_pdev, core); + } else { + core->dma_dev = &core->dev; + } break; case BCMA_HOSTTYPE_SDIO: break; @@ -528,6 +572,11 @@ static int __init bcma_modinit(void) if (err) return err; + err = bcma_host_soc_register_driver(); + if (err) { + pr_err("SoC host initialization failed\n"); + err = 0; + } #ifdef CONFIG_BCMA_HOST_PCI err = bcma_host_pci_init(); if (err) { @@ -545,6 +594,7 @@ static void __exit bcma_modexit(void) #ifdef CONFIG_BCMA_HOST_PCI bcma_host_pci_exit(); #endif + bcma_host_soc_unregister_driver(); bus_unregister(&bcma_bus_type); } module_exit(bcma_modexit) diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 634597917670..729f48e6b20b 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -323,6 +323,8 @@ struct bcma_bus { struct pci_dev *host_pci; /* Pointer to the SDIO device (only for BCMA_HOSTTYPE_SDIO) */ struct sdio_func *host_sdio; + /* Pointer to platform device (only for BCMA_HOSTTYPE_SOC) */ + struct platform_device *host_pdev; }; struct bcma_chipinfo chipinfo; -- cgit v1.2.3 From e1c00e10e92c04aa637126db2e59b092bd4878f8 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Tue, 30 Sep 2014 12:03:48 +0300 Subject: net/mlx4_core: New init and exit flow for mlx4_core In the new flow, we separate the pci initialization and teardown from the initialization and teardown of the other resources. __mlx4_init_one handles the pci resources initialization. It then calls mlx4_load_one to initialize the remainder of the resources. When removing a device, mlx4_remove_one is invoked. However, now mlx4_remove_one calls mlx4_unload_one to free all the resources except the pci resources. When mlx4_unload_one returns, mlx4_remove_one then frees the pci resources. The above separation will allow us to implement 'reset flow' in the future. It will also enable more EQs for VFs and is a pre-step to the modern API to enable/disable SRIOV. Also added nvfs; an integer array of size MLX4_MAX_PORTS + 1; to the mlx4_dev struct. This new field is used to avoid parsing the num_vfs module parameter each time the mlx4_restart_one is called. Signed-off-by: Majd Dibbiny Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 393 +++++++++++++++++------------- include/linux/mlx4/device.h | 1 + 2 files changed, 220 insertions(+), 174 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 4e9857b409f3..f6c32a947185 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2259,116 +2259,18 @@ static void mlx4_free_ownership(struct mlx4_dev *dev) iounmap(owner); } -static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) +static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, + int total_vfs, int *nvfs, struct mlx4_priv *priv) { - struct mlx4_priv *priv; struct mlx4_dev *dev; + unsigned sum = 0; int err; int port; - int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; - int prb_vf[MLX4_MAX_PORTS + 1] = {0, 0, 0}; - const int param_map[MLX4_MAX_PORTS + 1][MLX4_MAX_PORTS + 1] = { - {2, 0, 0}, {0, 1, 2}, {0, 1, 2} }; - unsigned total_vfs = 0; - int sriov_initialized = 0; - unsigned int i; + int i; int existing_vfs = 0; - pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); - - err = pci_enable_device(pdev); - if (err) { - dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); - return err; - } - - /* Due to requirement that all VFs and the PF are *guaranteed* 2 MACS - * per port, we must limit the number of VFs to 63 (since their are - * 128 MACs) - */ - for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && i < num_vfs_argc; - total_vfs += nvfs[param_map[num_vfs_argc - 1][i]], i++) { - nvfs[param_map[num_vfs_argc - 1][i]] = num_vfs[i]; - if (nvfs[i] < 0) { - dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); - return -EINVAL; - } - } - for (i = 0; i < sizeof(prb_vf)/sizeof(prb_vf[0]) && i < probe_vfs_argc; - i++) { - prb_vf[param_map[probe_vfs_argc - 1][i]] = probe_vf[i]; - if (prb_vf[i] < 0 || prb_vf[i] > nvfs[i]) { - dev_err(&pdev->dev, "probe_vf module parameter cannot be negative or greater than num_vfs\n"); - return -EINVAL; - } - } - if (total_vfs >= MLX4_MAX_NUM_VF) { - dev_err(&pdev->dev, - "Requested more VF's (%d) than allowed (%d)\n", - total_vfs, MLX4_MAX_NUM_VF - 1); - return -EINVAL; - } - - for (i = 0; i < MLX4_MAX_PORTS; i++) { - if (nvfs[i] + nvfs[2] >= MLX4_MAX_NUM_VF_P_PORT) { - dev_err(&pdev->dev, - "Requested more VF's (%d) for port (%d) than allowed (%d)\n", - nvfs[i] + nvfs[2], i + 1, - MLX4_MAX_NUM_VF_P_PORT - 1); - return -EINVAL; - } - } - - - /* - * Check for BARs. - */ - if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && - !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { - dev_err(&pdev->dev, "Missing DCS, aborting (driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%lx)\n", - pci_dev_data, pci_resource_flags(pdev, 0)); - err = -ENODEV; - goto err_disable_pdev; - } - if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { - dev_err(&pdev->dev, "Missing UAR, aborting\n"); - err = -ENODEV; - goto err_disable_pdev; - } - - err = pci_request_regions(pdev, DRV_NAME); - if (err) { - dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); - goto err_disable_pdev; - } - - pci_set_master(pdev); + dev = &priv->dev; - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) { - dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); - if (err) { - dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); - goto err_release_regions; - } - } - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) { - dev_warn(&pdev->dev, "Warning: couldn't set 64-bit consistent PCI DMA mask\n"); - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); - if (err) { - dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, aborting\n"); - goto err_release_regions; - } - } - - /* Allow large DMA segments, up to the firmware limit of 1 GB */ - dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); - - dev = pci_get_drvdata(pdev); - priv = mlx4_priv(dev); - dev->pdev = pdev; INIT_LIST_HEAD(&priv->ctx_list); spin_lock_init(&priv->ctx_lock); @@ -2382,28 +2284,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) dev->rev_id = pdev->revision; dev->numa_node = dev_to_node(&pdev->dev); + /* Detect if this device is a virtual function */ if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { - /* When acting as pf, we normally skip vfs unless explicitly - * requested to probe them. */ - if (total_vfs) { - unsigned vfs_offset = 0; - for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && - vfs_offset + nvfs[i] < extended_func_num(pdev); - vfs_offset += nvfs[i], i++) - ; - if (i == sizeof(nvfs)/sizeof(nvfs[0])) { - err = -ENODEV; - goto err_free_dev; - } - if ((extended_func_num(pdev) - vfs_offset) - > prb_vf[i]) { - mlx4_warn(dev, "Skipping virtual function:%d\n", - extended_func_num(pdev)); - err = -ENODEV; - goto err_free_dev; - } - } mlx4_warn(dev, "Detected virtual function - running in slave mode\n"); dev->flags |= MLX4_FLAG_SLAVE; } else { @@ -2413,11 +2296,10 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) err = mlx4_get_ownership(dev); if (err) { if (err < 0) - goto err_free_dev; + return err; else { mlx4_warn(dev, "Multiple PFs not yet supported - Skipping PF\n"); - err = -EINVAL; - goto err_free_dev; + return -EINVAL; } } @@ -2429,7 +2311,8 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) GFP_KERNEL); if (NULL == dev->dev_vfs) { mlx4_err(dev, "Failed to allocate memory for VFs\n"); - err = 0; + err = -ENOMEM; + goto err_free_own; } else { atomic_inc(&pf_loading); existing_vfs = pci_num_vf(pdev); @@ -2445,13 +2328,11 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d)\n", err); atomic_dec(&pf_loading); - err = 0; } else { mlx4_warn(dev, "Running in master mode\n"); dev->flags |= MLX4_FLAG_SRIOV | MLX4_FLAG_MASTER; dev->num_vfs = total_vfs; - sriov_initialized = 1; } } } @@ -2467,7 +2348,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) err = mlx4_reset(dev); if (err) { mlx4_err(dev, "Failed to reset HCA, aborting\n"); - goto err_rel_own; + goto err_sriov; } } @@ -2517,34 +2398,46 @@ slave_start: /* In master functions, the communication channel must be initialized * after obtaining its address from fw */ if (mlx4_is_master(dev)) { - unsigned sum = 0; - err = mlx4_multi_func_init(dev); - if (err) { - mlx4_err(dev, "Failed to init master mfunc interface, aborting\n"); + int ib_ports = 0; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_ports++; + + if (ib_ports && + (num_vfs_argc > 1 || probe_vfs_argc > 1)) { + mlx4_err(dev, + "Invalid syntax of num_vfs/probe_vfs with IB port - single port VFs syntax is only supported when all ports are configured as ethernet\n"); + err = -EINVAL; + goto err_close; + } + if (dev->caps.num_ports < 2 && + num_vfs_argc > 1) { + err = -EINVAL; + mlx4_err(dev, + "Error: Trying to configure VFs on port 2, but HCA has only %d physical ports\n", + dev->caps.num_ports); goto err_close; } - if (sriov_initialized) { - int ib_ports = 0; - mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) - ib_ports++; + memcpy(dev->nvfs, nvfs, sizeof(dev->nvfs)); - if (ib_ports && - (num_vfs_argc > 1 || probe_vfs_argc > 1)) { - mlx4_err(dev, - "Invalid syntax of num_vfs/probe_vfs with IB port - single port VFs syntax is only supported when all ports are configured as ethernet\n"); - err = -EINVAL; - goto err_master_mfunc; - } - for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]); i++) { - unsigned j; - for (j = 0; j < nvfs[i]; ++sum, ++j) { - dev->dev_vfs[sum].min_port = - i < 2 ? i + 1 : 1; - dev->dev_vfs[sum].n_ports = i < 2 ? 1 : - dev->caps.num_ports; - } + for (i = 0; i < sizeof(dev->nvfs)/sizeof(dev->nvfs[0]); i++) { + unsigned j; + + for (j = 0; j < dev->nvfs[i]; ++sum, ++j) { + dev->dev_vfs[sum].min_port = i < 2 ? i + 1 : 1; + dev->dev_vfs[sum].n_ports = i < 2 ? 1 : + dev->caps.num_ports; } } + + /* In master functions, the communication channel + * must be initialized after obtaining its address from fw + */ + err = mlx4_multi_func_init(dev); + if (err) { + mlx4_err(dev, "Failed to init master mfunc interface, aborting.\n"); + goto err_close; + } } err = mlx4_alloc_eq_table(dev); @@ -2565,7 +2458,7 @@ slave_start: if (!mlx4_is_slave(dev)) { err = mlx4_init_steering(dev); if (err) - goto err_free_eq; + goto err_disable_msix; } err = mlx4_setup_hca(dev); @@ -2625,6 +2518,10 @@ err_steer: if (!mlx4_is_slave(dev)) mlx4_clear_steering(dev); +err_disable_msix: + if (dev->flags & MLX4_FLAG_MSI_X) + pci_disable_msix(pdev); + err_free_eq: mlx4_free_eq_table(dev); @@ -2641,9 +2538,6 @@ err_master_mfunc: } err_close: - if (dev->flags & MLX4_FLAG_MSI_X) - pci_disable_msix(pdev); - mlx4_close_hca(dev); err_mfunc: @@ -2657,17 +2551,151 @@ err_sriov: if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) pci_disable_sriov(pdev); -err_rel_own: - if (!mlx4_is_slave(dev)) - mlx4_free_ownership(dev); - if (mlx4_is_master(dev) && dev->num_vfs) atomic_dec(&pf_loading); kfree(priv->dev.dev_vfs); -err_free_dev: - kfree(priv); +err_free_own: + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + + return err; +} + +static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, + struct mlx4_priv *priv) +{ + int err; + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int prb_vf[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + const int param_map[MLX4_MAX_PORTS + 1][MLX4_MAX_PORTS + 1] = { + {2, 0, 0}, {0, 1, 2}, {0, 1, 2} }; + unsigned total_vfs = 0; + unsigned int i; + + pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); + return err; + } + + /* Due to requirement that all VFs and the PF are *guaranteed* 2 MACS + * per port, we must limit the number of VFs to 63 (since their are + * 128 MACs) + */ + for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && i < num_vfs_argc; + total_vfs += nvfs[param_map[num_vfs_argc - 1][i]], i++) { + nvfs[param_map[num_vfs_argc - 1][i]] = num_vfs[i]; + if (nvfs[i] < 0) { + dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + for (i = 0; i < sizeof(prb_vf)/sizeof(prb_vf[0]) && i < probe_vfs_argc; + i++) { + prb_vf[param_map[probe_vfs_argc - 1][i]] = probe_vf[i]; + if (prb_vf[i] < 0 || prb_vf[i] > nvfs[i]) { + dev_err(&pdev->dev, "probe_vf module parameter cannot be negative or greater than num_vfs\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + if (total_vfs >= MLX4_MAX_NUM_VF) { + dev_err(&pdev->dev, + "Requested more VF's (%d) than allowed (%d)\n", + total_vfs, MLX4_MAX_NUM_VF - 1); + err = -EINVAL; + goto err_disable_pdev; + } + + for (i = 0; i < MLX4_MAX_PORTS; i++) { + if (nvfs[i] + nvfs[2] >= MLX4_MAX_NUM_VF_P_PORT) { + dev_err(&pdev->dev, + "Requested more VF's (%d) for port (%d) than allowed (%d)\n", + nvfs[i] + nvfs[2], i + 1, + MLX4_MAX_NUM_VF_P_PORT - 1); + err = -EINVAL; + goto err_disable_pdev; + } + } + + /* Check for BARs. */ + if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && + !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing DCS, aborting (driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%lx)\n", + pci_dev_data, pci_resource_flags(pdev, 0)); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing UAR, aborting\n"); + err = -ENODEV; + goto err_disable_pdev; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); + goto err_disable_pdev; + } + + pci_set_master(pdev); + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit consistent PCI DMA mask\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + + /* Allow large DMA segments, up to the firmware limit of 1 GB */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + /* Detect if this device is a virtual function */ + if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { + /* When acting as pf, we normally skip vfs unless explicitly + * requested to probe them. + */ + if (total_vfs) { + unsigned vfs_offset = 0; + + for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && + vfs_offset + nvfs[i] < extended_func_num(pdev); + vfs_offset += nvfs[i], i++) + ; + if (i == sizeof(nvfs)/sizeof(nvfs[0])) { + err = -ENODEV; + goto err_release_regions; + } + if ((extended_func_num(pdev) - vfs_offset) + > prb_vf[i]) { + dev_warn(&pdev->dev, "Skipping virtual function:%d\n", + extended_func_num(pdev)); + err = -ENODEV; + goto err_release_regions; + } + } + } + + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + if (err) + goto err_release_regions; + return 0; err_release_regions: pci_release_regions(pdev); @@ -2682,6 +2710,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { struct mlx4_priv *priv; struct mlx4_dev *dev; + int ret; printk_once(KERN_INFO "%s", mlx4_version); @@ -2690,13 +2719,18 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) return -ENOMEM; dev = &priv->dev; + dev->pdev = pdev; pci_set_drvdata(pdev, dev); priv->pci_dev_data = id->driver_data; - return __mlx4_init_one(pdev, id->driver_data); + ret = __mlx4_init_one(pdev, id->driver_data, priv); + if (ret) + kfree(priv); + + return ret; } -static void __mlx4_remove_one(struct pci_dev *pdev) +static void mlx4_unload_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); @@ -2775,8 +2809,6 @@ static void __mlx4_remove_one(struct pci_dev *pdev) kfree(dev->caps.qp1_proxy); kfree(dev->dev_vfs); - pci_release_regions(pdev); - pci_disable_device(pdev); memset(priv, 0, sizeof(*priv)); priv->pci_dev_data = pci_dev_data; priv->removed = 1; @@ -2787,7 +2819,9 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); - __mlx4_remove_one(pdev); + mlx4_unload_one(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); kfree(priv); pci_set_drvdata(pdev, NULL); } @@ -2796,11 +2830,22 @@ int mlx4_restart_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); - int pci_dev_data; + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int pci_dev_data, err, total_vfs; pci_dev_data = priv->pci_dev_data; - __mlx4_remove_one(pdev); - return __mlx4_init_one(pdev, pci_dev_data); + total_vfs = dev->num_vfs; + memcpy(nvfs, dev->nvfs, sizeof(dev->nvfs)); + + mlx4_unload_one(pdev); + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + if (err) { + mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n", + __func__, pci_name(pdev), err); + return err; + } + + return err; } static const struct pci_device_id mlx4_pci_table[] = { @@ -2854,7 +2899,7 @@ MODULE_DEVICE_TABLE(pci, mlx4_pci_table); static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { - __mlx4_remove_one(pdev); + mlx4_unload_one(pdev); return state == pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; @@ -2866,7 +2911,7 @@ static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) struct mlx4_priv *priv = mlx4_priv(dev); int ret; - ret = __mlx4_init_one(pdev, priv->pci_dev_data); + ret = __mlx4_init_one(pdev, priv->pci_dev_data, priv); return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; } @@ -2880,7 +2925,7 @@ static struct pci_driver mlx4_driver = { .name = DRV_NAME, .id_table = mlx4_pci_table, .probe = mlx4_init_one, - .shutdown = __mlx4_remove_one, + .shutdown = mlx4_unload_one, .remove = mlx4_remove_one, .err_handler = &mlx4_err_handler, }; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 03b5608a4329..b2f8ab9a57c4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -707,6 +707,7 @@ struct mlx4_dev { u64 regid_promisc_array[MLX4_MAX_PORTS + 1]; u64 regid_allmulti_array[MLX4_MAX_PORTS + 1]; struct mlx4_vf_dev *dev_vfs; + int nvfs[MLX4_MAX_PORTS + 1]; }; struct mlx4_eqe { -- cgit v1.2.3 From a12a601ed163578084a48708ae376805f79a1ccf Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 30 Sep 2014 09:49:55 +0800 Subject: tcp: Change tcp_slow_start function to return void No caller uses the return value, so make this function return void. Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_cong.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 545a79aa4212..ba2f9d03076b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -830,7 +830,7 @@ void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name); -int tcp_slow_start(struct tcp_sock *tp, u32 acked); +void tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); u32 tcp_reno_ssthresh(struct sock *sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index a6c8a5775624..b1c5970d47a1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -291,15 +291,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ -int tcp_slow_start(struct tcp_sock *tp, u32 acked) +void tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = tp->snd_cwnd + acked; if (cwnd > tp->snd_ssthresh) cwnd = tp->snd_ssthresh + 1; - acked -= cwnd - tp->snd_cwnd; tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); - return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); -- cgit v1.2.3 From d0bf4a9e92b9a93ffeeacbd7b6cb83e0ee3dc2ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Sep 2014 13:29:15 -0700 Subject: net: cleanup and document skb fclone layout Lets use a proper structure to clearly document and implement skb fast clones. Then, we might experiment more easily alternative layouts. This patch adds a new skb_fclone_busy() helper, used by tcp and xfrm, to stop leaking of implementation details. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 25 +++++++++++++++++++++++++ net/core/skbuff.c | 41 ++++++++++++++++++++--------------------- net/ipv4/tcp_output.c | 5 +---- net/xfrm/xfrm_policy.c | 4 +--- 4 files changed, 47 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 262efdbc346b..d8f7d74d5a4d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -781,6 +781,31 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, int *errcode, gfp_t gfp_mask); +/* Layout of fast clones : [skb1][skb2][fclone_ref] */ +struct sk_buff_fclones { + struct sk_buff skb1; + + struct sk_buff skb2; + + atomic_t fclone_ref; +}; + +/** + * skb_fclone_busy - check if fclone is busy + * @skb: buffer + * + * Returns true is skb is a fast clone, and its clone is not freed. + */ +static inline bool skb_fclone_busy(const struct sk_buff *skb) +{ + const struct sk_buff_fclones *fclones; + + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + return skb->fclone == SKB_FCLONE_ORIG && + fclones->skb2.fclone == SKB_FCLONE_CLONE; +} + static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4be570a4ab21..a8cebb40699c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -257,15 +257,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { - struct sk_buff *child = skb + 1; - atomic_t *fclone_ref = (atomic_t *) (child + 1); + struct sk_buff_fclones *fclones; - kmemcheck_annotate_bitfield(child, flags1); + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; - atomic_set(fclone_ref, 1); + atomic_set(&fclones->fclone_ref, 1); - child->fclone = SKB_FCLONE_UNAVAILABLE; - child->pfmemalloc = pfmemalloc; + fclones->skb2.fclone = SKB_FCLONE_UNAVAILABLE; + fclones->skb2.pfmemalloc = pfmemalloc; } out: return skb; @@ -524,8 +525,7 @@ static void skb_release_data(struct sk_buff *skb) */ static void kfree_skbmem(struct sk_buff *skb) { - struct sk_buff *other; - atomic_t *fclone_ref; + struct sk_buff_fclones *fclones; switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: @@ -533,22 +533,21 @@ static void kfree_skbmem(struct sk_buff *skb) break; case SKB_FCLONE_ORIG: - fclone_ref = (atomic_t *) (skb + 2); - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, skb); + fclones = container_of(skb, struct sk_buff_fclones, skb1); + if (atomic_dec_and_test(&fclones->fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, fclones); break; case SKB_FCLONE_CLONE: - fclone_ref = (atomic_t *) (skb + 1); - other = skb - 1; + fclones = container_of(skb, struct sk_buff_fclones, skb2); /* The clone portion is available for * fast-cloning again. */ skb->fclone = SKB_FCLONE_UNAVAILABLE; - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, other); + if (atomic_dec_and_test(&fclones->fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, fclones); break; } } @@ -859,17 +858,18 @@ EXPORT_SYMBOL_GPL(skb_copy_ubufs); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { - struct sk_buff *n; + struct sk_buff_fclones *fclones = container_of(skb, + struct sk_buff_fclones, + skb1); + struct sk_buff *n = &fclones->skb2; if (skb_orphan_frags(skb, gfp_mask)) return NULL; - n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && n->fclone == SKB_FCLONE_UNAVAILABLE) { - atomic_t *fclone_ref = (atomic_t *) (n + 1); n->fclone = SKB_FCLONE_CLONE; - atomic_inc(fclone_ref); + atomic_inc(&fclones->fclone_ref); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -3240,8 +3240,7 @@ void __init skb_init(void) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", - (2*sizeof(struct sk_buff)) + - sizeof(atomic_t), + sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ee567e9e98c3..8d4eac793700 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2110,10 +2110,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { - const struct sk_buff *fclone = skb + 1; - - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) { + if (unlikely(skb_fclone_busy(skb))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f623dca6ce30..4c4e457e7888 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1961,10 +1961,8 @@ static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) struct xfrm_dst *xdst = (struct xfrm_dst *) dst; struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; - const struct sk_buff *fclone = skb + 1; - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) { + if (unlikely(skb_fclone_busy(skb))) { kfree_skb(skb); return 0; } -- cgit v1.2.3 From 8bce6d7d0d1ede22af334ee241841e9278365278 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 29 Sep 2014 20:22:29 -0700 Subject: udp: Generalize skb_udp_segment skb_udp_segment is the function called from udp4_ufo_fragment to segment a UDP tunnel packet. This function currently assumes segmentation is transparent Ethernet bridging (i.e. VXLAN encapsulation). This patch generalizes the function to operate on either Ethertype or IP protocol. The inner_protocol field must be set to the protocol of the inner header. This can now be either an Ethertype or an IP protocol (in a union). A new flag in the skbuff indicates which type is effective. skb_set_inner_protocol and skb_set_inner_ipproto helper functions were added to set the inner_protocol. These functions are called from the point where the tunnel encapsulation is occuring. When skb_udp_tunnel_segment is called, the function to segment the inner packet is selected based on the inner IP or Ethertype. In the case of an IP protocol encapsulation, the function is derived from inet[6]_offloads. In the case of Ethertype, skb->protocol is set to the inner_protocol and skb_mac_gso_segment is called. (GRE currently does this, but it might be possible to lookup the protocol in offload_base and call the appropriate segmenation function directly). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 26 +++++++++++++++++++++++-- include/net/udp.h | 3 ++- net/ipv4/udp_offload.c | 51 +++++++++++++++++++++++++++++++++++++++++++++----- net/ipv6/udp_offload.c | 2 +- 4 files changed, 73 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d8f7d74d5a4d..7c5036d11feb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -596,7 +596,8 @@ struct sk_buff { __u8 ndisc_nodetype:2; #endif __u8 ipvs_property:1; - /* 5 or 7 bit hole */ + __u8 inner_protocol_type:1; + /* 4 or 6 bit hole */ #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -632,7 +633,11 @@ struct sk_buff { __u32 reserved_tailroom; }; - __be16 inner_protocol; + union { + __be16 inner_protocol; + __u8 inner_ipproto; + }; + __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; @@ -1762,6 +1767,23 @@ static inline void skb_reserve(struct sk_buff *skb, int len) skb->tail += len; } +#define ENCAP_TYPE_ETHER 0 +#define ENCAP_TYPE_IPPROTO 1 + +static inline void skb_set_inner_protocol(struct sk_buff *skb, + __be16 protocol) +{ + skb->inner_protocol = protocol; + skb->inner_protocol_type = ENCAP_TYPE_ETHER; +} + +static inline void skb_set_inner_ipproto(struct sk_buff *skb, + __u8 ipproto) +{ + skb->inner_ipproto = ipproto; + skb->inner_protocol_type = ENCAP_TYPE_IPPROTO; +} + static inline void skb_reset_inner_headers(struct sk_buff *skb) { skb->inner_mac_header = skb->mac_header; diff --git a/include/net/udp.h b/include/net/udp.h index 16f4e80f0519..07f9b70962f6 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -239,7 +239,8 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_disconnect(struct sock *sk, int flags); unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, - netdev_features_t features); + netdev_features_t features, + bool is_ipv6); int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int udp_lib_setsockopt(struct sock *sk, int level, int optname, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 19ebe6a39ddc..8c35f2c939ee 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -25,8 +25,11 @@ struct udp_offload_priv { struct udp_offload_priv __rcu *next; }; -struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features, + struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, + netdev_features_t features), + __be16 new_protocol) { struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; @@ -48,7 +51,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); - skb->protocol = htons(ETH_P_TEB); + skb->protocol = new_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); if (need_csum) @@ -56,7 +59,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); - segs = skb_mac_gso_segment(skb, enc_features); + segs = gso_inner_segment(skb, enc_features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); @@ -101,6 +104,44 @@ out: return segs; } +struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features, + bool is_ipv6) +{ + __be16 protocol = skb->protocol; + const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, + netdev_features_t features); + + rcu_read_lock(); + + switch (skb->inner_protocol_type) { + case ENCAP_TYPE_ETHER: + protocol = skb->inner_protocol; + gso_inner_segment = skb_mac_gso_segment; + break; + case ENCAP_TYPE_IPPROTO: + offloads = is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[skb->inner_ipproto]); + if (!ops || !ops->callbacks.gso_segment) + goto out_unlock; + gso_inner_segment = ops->callbacks.gso_segment; + break; + default: + goto out_unlock; + } + + segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment, + protocol); + +out_unlock: + rcu_read_unlock(); + + return segs; +} + static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { @@ -113,7 +154,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (skb->encapsulation && (skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { - segs = skb_udp_tunnel_segment(skb, features); + segs = skb_udp_tunnel_segment(skb, features, false); goto out; } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 212ebfc7973f..8f96988c1db2 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -58,7 +58,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) - segs = skb_udp_tunnel_segment(skb, features); + segs = skb_udp_tunnel_segment(skb, features, true); else { const struct ipv6hdr *ipv6h; struct udphdr *uh; -- cgit v1.2.3 From a0efb80ce3abacfd22a4284c3730924fc2f1f077 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 30 Sep 2014 16:07:24 -0700 Subject: net_sched: avoid calling tcf_unbind_filter() in call_rcu callback This fixes the following crash: [ 63.976822] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 63.980094] CPU: 1 PID: 15 Comm: ksoftirqd/1 Not tainted 3.17.0-rc6+ #648 [ 63.980094] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 63.980094] task: ffff880117dea690 ti: ffff880117dfc000 task.ti: ffff880117dfc000 [ 63.980094] RIP: 0010:[] [] u32_destroy_key+0x27/0x6d [ 63.980094] RSP: 0018:ffff880117dffcc0 EFLAGS: 00010202 [ 63.980094] RAX: ffff880117dea690 RBX: ffff8800d02e0820 RCX: 0000000000000000 [ 63.980094] RDX: 0000000000000001 RSI: 0000000000000002 RDI: 6b6b6b6b6b6b6b6b [ 63.980094] RBP: ffff880117dffcd0 R08: 0000000000000000 R09: 0000000000000000 [ 63.980094] R10: 00006c0900006ba8 R11: 00006ba100006b9d R12: 0000000000000001 [ 63.980094] R13: ffff8800d02e0898 R14: ffffffff817e6d4d R15: ffff880117387a30 [ 63.980094] FS: 0000000000000000(0000) GS:ffff88011a800000(0000) knlGS:0000000000000000 [ 63.980094] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 63.980094] CR2: 00007f07e6732fed CR3: 000000011665b000 CR4: 00000000000006e0 [ 63.980094] Stack: [ 63.980094] ffff88011a9cd300 ffffffff82051ac0 ffff880117dffce0 ffffffff817e6d68 [ 63.980094] ffff880117dffd70 ffffffff810cb4c7 ffffffff810cb3cd ffff880117dfffd8 [ 63.980094] ffff880117dea690 ffff880117dea690 ffff880117dfffd8 000000000000000a [ 63.980094] Call Trace: [ 63.980094] [] u32_delete_key_freepf_rcu+0x1b/0x1d [ 63.980094] [] rcu_process_callbacks+0x3bb/0x691 [ 63.980094] [] ? rcu_process_callbacks+0x2c1/0x691 [ 63.980094] [] ? u32_destroy_key+0x6d/0x6d [ 63.980094] [] __do_softirq+0x142/0x323 [ 63.980094] [] run_ksoftirqd+0x23/0x53 [ 63.980094] [] smpboot_thread_fn+0x203/0x221 [ 63.980094] [] ? smpboot_unpark_thread+0x33/0x33 [ 63.980094] [] kthread+0xc9/0xd1 [ 63.980094] [] ? do_wait_for_common+0xf8/0x125 [ 63.980094] [] ? __kthread_parkme+0x61/0x61 [ 63.980094] [] ret_from_fork+0x7c/0xb0 [ 63.980094] [] ? __kthread_parkme+0x61/0x61 tp could be freed in call_rcu callback too, the order is not guaranteed. John Fastabend says: ==================== Its worth noting why this is safe. Any running schedulers will either read the valid class field or it will be zeroed. All schedulers today when the class is 0 do a lookup using the same call used by the tcf_exts_bind(). So even if we have a running classifier hit the null class pointer it will do a lookup and get to the same result. This is particularly fragile at the moment because the only way to verify this is to audit the schedulers call sites. ==================== Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 6 +----- net/sched/cls_u32.c | 10 ++++++---- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 73f9532ee581..ef44ad9a6426 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -20,11 +20,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); static inline unsigned long __cls_set_class(unsigned long *clp, unsigned long cl) { - unsigned long old_cl; - - old_cl = *clp; - *clp = cl; - return old_cl; + return xchg(clp, cl); } static inline unsigned long diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4be3ebf46d67..0472909bb014 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -358,7 +358,6 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, bool free_pf) { - tcf_unbind_filter(tp, &n->res); tcf_exts_destroy(&n->exts); if (n->ht_down) n->ht_down->refcnt--; @@ -416,6 +415,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) if (pkp == key) { RCU_INIT_POINTER(*kp, key->next); + tcf_unbind_filter(tp, &key->res); call_rcu(&key->rcu, u32_delete_key_freepf_rcu); return 0; } @@ -425,7 +425,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) return 0; } -static void u32_clear_hnode(struct tc_u_hnode *ht) +static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) { struct tc_u_knode *n; unsigned int h; @@ -434,6 +434,7 @@ static void u32_clear_hnode(struct tc_u_hnode *ht) while ((n = rtnl_dereference(ht->ht[h])) != NULL) { RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); + tcf_unbind_filter(tp, &n->res); call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } @@ -447,7 +448,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) WARN_ON(ht->refcnt); - u32_clear_hnode(ht); + u32_clear_hnode(tp, ht); hn = &tp_c->hlist; for (phn = rtnl_dereference(*hn); @@ -482,7 +483,7 @@ static void u32_destroy(struct tcf_proto *tp) ht; ht = rtnl_dereference(ht->next)) { ht->refcnt--; - u32_clear_hnode(ht); + u32_clear_hnode(tp, ht); } while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) { @@ -731,6 +732,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } u32_replace_knode(tp, tp_c, new); + tcf_unbind_filter(tp, &n->res); call_rcu(&n->rcu, u32_delete_key_rcu); return 0; } -- cgit v1.2.3 From d068b02cfdfc27f5962ec82ec5568b706f599edc Mon Sep 17 00:00:00 2001 From: Petri Gynther Date: Wed, 1 Oct 2014 11:58:02 -0700 Subject: net: phy: add BCM7425 and BCM7429 PHYs Signed-off-by: Petri Gynther Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm7xxx.c | 28 ++++++++++++++++++++++++++++ include/linux/brcmphy.h | 2 ++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index daae69950925..1d211d369039 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -350,6 +350,32 @@ static struct phy_driver bcm7xxx_driver[] = { BCM7XXX_28NM_GPHY(PHY_ID_BCM7439, "Broadcom BCM7439"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7445, "Broadcom BCM7445"), { + .phy_id = PHY_ID_BCM7425, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM7425", + .features = PHY_GBIT_FEATURES | + SUPPORTED_Pause | SUPPORTED_Asym_Pause, + .flags = 0, + .config_init = bcm7xxx_config_init, + .config_aneg = genphy_config_aneg, + .read_status = genphy_read_status, + .suspend = bcm7xxx_suspend, + .resume = bcm7xxx_config_init, + .driver = { .owner = THIS_MODULE }, +}, { + .phy_id = PHY_ID_BCM7429, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM7429", + .features = PHY_GBIT_FEATURES | + SUPPORTED_Pause | SUPPORTED_Asym_Pause, + .flags = PHY_IS_INTERNAL, + .config_init = bcm7xxx_config_init, + .config_aneg = genphy_config_aneg, + .read_status = genphy_read_status, + .suspend = bcm7xxx_suspend, + .resume = bcm7xxx_config_init, + .driver = { .owner = THIS_MODULE }, +}, { .phy_id = PHY_BCM_OUI_4, .phy_id_mask = 0xffff0000, .name = "Broadcom BCM7XXX 40nm", @@ -381,6 +407,8 @@ static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = { { PHY_ID_BCM7250, 0xfffffff0, }, { PHY_ID_BCM7364, 0xfffffff0, }, { PHY_ID_BCM7366, 0xfffffff0, }, + { PHY_ID_BCM7425, 0xfffffff0, }, + { PHY_ID_BCM7429, 0xfffffff0, }, { PHY_ID_BCM7439, 0xfffffff0, }, { PHY_ID_BCM7445, 0xfffffff0, }, { PHY_BCM_OUI_4, 0xffff0000 }, diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 3f626fe48c9a..7ccd928cc1f2 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -16,6 +16,8 @@ #define PHY_ID_BCM7250 0xae025280 #define PHY_ID_BCM7364 0xae025260 #define PHY_ID_BCM7366 0x600d8490 +#define PHY_ID_BCM7425 0x03625e60 +#define PHY_ID_BCM7429 0x600d8730 #define PHY_ID_BCM7439 0x600d8480 #define PHY_ID_BCM7445 0x600d8510 -- cgit v1.2.3 From 51b0a5d8c21a91801bbef9bcc8639dc0b206c6cd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 26 Sep 2014 14:35:14 +0200 Subject: netfilter: nft_reject: introduce icmp code abstraction for inet and bridge This patch introduces the NFT_REJECT_ICMPX_UNREACH type which provides an abstraction to the ICMP and ICMPv6 codes that you can use from the inet and bridge tables, they are: * NFT_REJECT_ICMPX_NO_ROUTE: no route to host - network unreachable * NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable * NFT_REJECT_ICMPX_HOST_UNREACH: host unreachable * NFT_REJECT_ICMPX_ADMIN_PROHIBITED: administratevely prohibited You can still use the specific codes when restricting the rule to match the corresponding layer 3 protocol. I decided to not overload the existing NFT_REJECT_ICMP_UNREACH to have different semantics depending on the table family and to allow the user to specify ICMP family specific codes if they restrict it to the corresponding family. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_reject.h | 1 + include/net/netfilter/nft_reject.h | 9 +-- include/uapi/linux/netfilter/nf_tables.h | 21 +++++++ net/bridge/netfilter/nft_reject_bridge.c | 95 ++++++++++++++++++++++++++++++-- net/ipv4/netfilter/nft_reject_ipv4.c | 1 - net/netfilter/nft_reject.c | 37 +++++++++++++ net/netfilter/nft_reject_inet.c | 94 +++++++++++++++++++++++++++++-- 7 files changed, 241 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index f713b5a31d62..8ce06385a552 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -5,6 +5,7 @@ #include #include #include +#include static inline void nf_send_unreach(struct sk_buff *skb_in, int code) { diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 36b0da2d55bb..60fa1530006b 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -14,12 +14,7 @@ int nft_reject_init(const struct nft_ctx *ctx, int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); -void nft_reject_ipv4_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt); - -void nft_reject_ipv6_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt); +int nft_reject_icmp_code(u8 code); +int nft_reject_icmpv6_code(u8 code); #endif diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b72ccfeaf865..c26df6787fb0 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -749,12 +749,33 @@ enum nft_queue_attributes { * * @NFT_REJECT_ICMP_UNREACH: reject using ICMP unreachable * @NFT_REJECT_TCP_RST: reject using TCP RST + * @NFT_REJECT_ICMPX_UNREACH: abstracted ICMP unreachable for bridge and inet */ enum nft_reject_types { NFT_REJECT_ICMP_UNREACH, NFT_REJECT_TCP_RST, + NFT_REJECT_ICMPX_UNREACH, }; +/** + * enum nft_reject_code - Generic reject codes for IPv4/IPv6 + * + * @NFT_REJECT_ICMPX_NO_ROUTE: no route to host / network unreachable + * @NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable + * @NFT_REJECT_ICMPX_HOST_UNREACH: host unreachable + * @NFT_REJECT_ICMPX_ADMIN_PROHIBITED: administratively prohibited + * + * These codes are mapped to real ICMP and ICMPv6 codes. + */ +enum nft_reject_inet_code { + NFT_REJECT_ICMPX_NO_ROUTE = 0, + NFT_REJECT_ICMPX_PORT_UNREACH, + NFT_REJECT_ICMPX_HOST_UNREACH, + NFT_REJECT_ICMPX_ADMIN_PROHIBITED, + __NFT_REJECT_ICMPX_MAX +}; +#define NFT_REJECT_ICMPX_MAX (__NFT_REJECT_ICMPX_MAX + 1) + /** * enum nft_reject_attributes - nf_tables reject expression netlink attributes * diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index ee3ffe93e14e..a76479535df2 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -14,21 +14,106 @@ #include #include #include +#include +#include static void nft_reject_bridge_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1], const struct nft_pktinfo *pkt) { + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + switch (eth_hdr(pkt->skb)->h_proto) { case htons(ETH_P_IP): - return nft_reject_ipv4_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach(pkt->skb, + nft_reject_icmp_code(priv->icmp_code)); + break; + } + break; case htons(ETH_P_IPV6): - return nft_reject_ipv6_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach6(net, pkt->skb, + nft_reject_icmpv6_code(priv->icmp_code), + pkt->ops->hooknum); + break; + } + break; default: /* No explicit way to reject this protocol, drop it. */ - data[NFT_REG_VERDICT].verdict = NF_DROP; break; } + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static int nft_reject_bridge_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + int icmp_code; + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + + icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + if (priv->type == NFT_REJECT_ICMPX_UNREACH && + icmp_code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + priv->icmp_code = icmp_code; + break; + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; + } + return 0; +} + +static int nft_reject_bridge_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; } static struct nft_expr_type nft_reject_bridge_type; @@ -36,8 +121,8 @@ static const struct nft_expr_ops nft_reject_bridge_ops = { .type = &nft_reject_bridge_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_bridge_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, + .init = nft_reject_bridge_init, + .dump = nft_reject_bridge_dump, }; static struct nft_expr_type nft_reject_bridge_type __read_mostly = { diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index e79718a382f2..ed33299c56d1 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index f3448c296446..ec8a456092a7 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, @@ -70,5 +72,40 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_reject_dump); +static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX] = { + [NFT_REJECT_ICMPX_NO_ROUTE] = ICMP_NET_UNREACH, + [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMP_PORT_UNREACH, + [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMP_HOST_UNREACH, + [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMP_PKT_FILTERED, +}; + +int nft_reject_icmp_code(u8 code) +{ + if (code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + return icmp_code_v4[code]; +} + +EXPORT_SYMBOL_GPL(nft_reject_icmp_code); + + +static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX] = { + [NFT_REJECT_ICMPX_NO_ROUTE] = ICMPV6_NOROUTE, + [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMPV6_PORT_UNREACH, + [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMPV6_ADDR_UNREACH, + [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMPV6_ADM_PROHIBITED, +}; + +int nft_reject_icmpv6_code(u8 code) +{ + if (code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + return icmp_code_v6[code]; +} + +EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index b718a52a4654..7b5f9d58680a 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -14,17 +14,103 @@ #include #include #include +#include +#include static void nft_reject_inet_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1], const struct nft_pktinfo *pkt) { + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + switch (pkt->ops->pf) { case NFPROTO_IPV4: - return nft_reject_ipv4_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach(pkt->skb, + nft_reject_icmp_code(priv->icmp_code)); + break; + } + break; case NFPROTO_IPV6: - return nft_reject_ipv6_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach6(net, pkt->skb, + nft_reject_icmpv6_code(priv->icmp_code), + pkt->ops->hooknum); + break; + } + break; + } + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static int nft_reject_inet_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + int icmp_code; + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + + icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + if (priv->type == NFT_REJECT_ICMPX_UNREACH && + icmp_code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + priv->icmp_code = icmp_code; + break; + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; } + return 0; +} + +static int nft_reject_inet_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; } static struct nft_expr_type nft_reject_inet_type; @@ -32,8 +118,8 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .type = &nft_reject_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_inet_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, + .init = nft_reject_inet_init, + .dump = nft_reject_inet_dump, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { -- cgit v1.2.3 From c8d7b98bec43faaa6583c3135030be5eb4693acb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 26 Sep 2014 14:35:15 +0200 Subject: netfilter: move nf_send_resetX() code to nf_reject_ipvX modules Move nf_send_reset() and nf_send_reset6() to nf_reject_ipv4 and nf_reject_ipv6 respectively. This code is shared by x_tables and nf_tables. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_reject.h | 118 +----------------------- net/ipv4/netfilter/Kconfig | 6 ++ net/ipv4/netfilter/Makefile | 3 + net/ipv4/netfilter/nf_reject_ipv4.c | 127 +++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 6 ++ net/ipv6/netfilter/Makefile | 3 + net/ipv6/netfilter/nf_reject_ipv6.c | 163 +++++++++++++++++++++++++++++++++ 7 files changed, 309 insertions(+), 117 deletions(-) create mode 100644 net/ipv4/netfilter/nf_reject_ipv4.c create mode 100644 net/ipv6/netfilter/nf_reject_ipv6.c (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index 8ce06385a552..e8427193c777 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -1,10 +1,6 @@ #ifndef _IPV4_NF_REJECT_H #define _IPV4_NF_REJECT_H -#include -#include -#include -#include #include static inline void nf_send_unreach(struct sk_buff *skb_in, int code) @@ -12,118 +8,6 @@ static inline void nf_send_unreach(struct sk_buff *skb_in, int code) icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); } -/* Send RST reply */ -static void nf_send_reset(struct sk_buff *oldskb, int hook) -{ - struct sk_buff *nskb; - const struct iphdr *oiph; - struct iphdr *niph; - const struct tcphdr *oth; - struct tcphdr _otcph, *tcph; - - /* IP header checks: fragment. */ - if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) - return; - - oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), - sizeof(_otcph), &_otcph); - if (oth == NULL) - return; - - /* No RST for RST. */ - if (oth->rst) - return; - - if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return; - - /* Check checksum */ - if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) - return; - oiph = ip_hdr(oldskb); - - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); - if (!nskb) - return; - - skb_reserve(nskb, LL_MAX_HEADER); - - skb_reset_network_header(nskb); - niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); - niph->version = 4; - niph->ihl = sizeof(struct iphdr) / 4; - niph->tos = 0; - niph->id = 0; - niph->frag_off = htons(IP_DF); - niph->protocol = IPPROTO_TCP; - niph->check = 0; - niph->saddr = oiph->daddr; - niph->daddr = oiph->saddr; - - skb_reset_transport_header(nskb); - tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); - memset(tcph, 0, sizeof(*tcph)); - tcph->source = oth->dest; - tcph->dest = oth->source; - tcph->doff = sizeof(struct tcphdr) / 4; - - if (oth->ack) - tcph->seq = oth->ack_seq; - else { - tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + - oldskb->len - ip_hdrlen(oldskb) - - (oth->doff << 2)); - tcph->ack = 1; - } - - tcph->rst = 1; - tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, - niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)tcph - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - /* ip_route_me_harder expects skb->dst to be set */ - skb_dst_set_noref(nskb, skb_dst(oldskb)); - - nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) - goto free_nskb; - - niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); - - /* "Never happens" */ - if (nskb->len > dst_mtu(skb_dst(nskb))) - goto free_nskb; - - nf_ct_attach(nskb, oldskb); - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - /* If we use ip_local_out for bridged traffic, the MAC source on - * the RST will be ours, instead of the destination's. This confuses - * some routers/firewalls, and they drop the packet. So we need to - * build the eth header using the original destination's MAC as the - * source, and send the RST packet directly. - */ - if (oldskb->nf_bridge) { - struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = oldskb->nf_bridge->physindev; - niph->tot_len = htons(nskb->len); - ip_send_check(niph); - if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), - oeth->h_source, oeth->h_dest, nskb->len) < 0) - goto free_nskb; - dev_queue_xmit(nskb); - } else -#endif - ip_local_out(nskb); - - return; - - free_nskb: - kfree_skb(nskb); -} - +void nf_send_reset(struct sk_buff *oldskb, int hook); #endif /* _IPV4_NF_REJECT_H */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 345242a79db6..4c019d5c3f57 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -61,8 +61,13 @@ config NFT_CHAIN_ROUTE_IPV4 fields such as the source, destination, type of service and the packet mark. +config NF_REJECT_IPV4 + tristate "IPv4 packet rejection" + default m if NETFILTER_ADVANCED=n + config NFT_REJECT_IPV4 depends on NF_TABLES_IPV4 + select NF_REJECT_IPV4 default NFT_REJECT tristate @@ -208,6 +213,7 @@ config IP_NF_FILTER config IP_NF_TARGET_REJECT tristate "REJECT target support" depends on IP_NF_FILTER + select NF_REJECT_IPV4 default m if NETFILTER_ADVANCED=n help The REJECT target allows a filtering rule to specify that an ICMP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 14488cc5fd2c..f4cef5af0969 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -23,6 +23,9 @@ obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o +# reject +obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o + # NAT helpers (nf_conntrack) obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c new file mode 100644 index 000000000000..b023b4eb1a96 --- /dev/null +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -0,0 +1,127 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +/* Send RST reply */ +void nf_send_reset(struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + const struct iphdr *oiph; + struct iphdr *niph; + const struct tcphdr *oth; + struct tcphdr _otcph, *tcph; + + /* IP header checks: fragment. */ + if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) + return; + + oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), + sizeof(_otcph), &_otcph); + if (oth == NULL) + return; + + /* No RST for RST. */ + if (oth->rst) + return; + + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + return; + + /* Check checksum */ + if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) + return; + oiph = ip_hdr(oldskb); + + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + return; + + skb_reserve(nskb, LL_MAX_HEADER); + + skb_reset_network_header(nskb); + niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); + niph->version = 4; + niph->ihl = sizeof(struct iphdr) / 4; + niph->tos = 0; + niph->id = 0; + niph->frag_off = htons(IP_DF); + niph->protocol = IPPROTO_TCP; + niph->check = 0; + niph->saddr = oiph->daddr; + niph->daddr = oiph->saddr; + + skb_reset_transport_header(nskb); + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + memset(tcph, 0, sizeof(*tcph)); + tcph->source = oth->dest; + tcph->dest = oth->source; + tcph->doff = sizeof(struct tcphdr) / 4; + + if (oth->ack) + tcph->seq = oth->ack_seq; + else { + tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + + oldskb->len - ip_hdrlen(oldskb) - + (oth->doff << 2)); + tcph->ack = 1; + } + + tcph->rst = 1; + tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, + niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)tcph - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + /* ip_route_me_harder expects skb->dst to be set */ + skb_dst_set_noref(nskb, skb_dst(oldskb)); + + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(nskb, RTN_UNSPEC)) + goto free_nskb; + + niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); + + /* "Never happens" */ + if (nskb->len > dst_mtu(skb_dst(nskb))) + goto free_nskb; + + nf_ct_attach(nskb, oldskb); + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + /* If we use ip_local_out for bridged traffic, the MAC source on + * the RST will be ours, instead of the destination's. This confuses + * some routers/firewalls, and they drop the packet. So we need to + * build the eth header using the original destination's MAC as the + * source, and send the RST packet directly. + */ + if (oldskb->nf_bridge) { + struct ethhdr *oeth = eth_hdr(oldskb); + nskb->dev = oldskb->nf_bridge->physindev; + niph->tot_len = htons(nskb->len); + ip_send_check(niph); + if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), + oeth->h_source, oeth->h_dest, nskb->len) < 0) + goto free_nskb; + dev_queue_xmit(nskb); + } else +#endif + ip_local_out(nskb); + + return; + + free_nskb: + kfree_skb(nskb); +} +EXPORT_SYMBOL_GPL(nf_send_reset); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index bb1a40db7be1..6af874fc187f 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -40,8 +40,13 @@ config NFT_CHAIN_ROUTE_IPV6 fields such as the source, destination, flowlabel, hop-limit and the packet mark. +config NF_REJECT_IPV6 + tristate "IPv6 packet rejection" + default m if NETFILTER_ADVANCED=n + config NFT_REJECT_IPV6 depends on NF_TABLES_IPV6 + select NF_REJECT_IPV6 default NFT_REJECT tristate @@ -208,6 +213,7 @@ config IP6_NF_FILTER config IP6_NF_TARGET_REJECT tristate "REJECT target support" depends on IP6_NF_FILTER + select NF_REJECT_IPV6 default m if NETFILTER_ADVANCED=n help The REJECT target allows a filtering rule to specify that an ICMPv6 diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 0f7e5b3f328d..fbb25f01143c 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -27,6 +27,9 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o # logging obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o +# reject +obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o + # nf_tables obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c new file mode 100644 index 000000000000..5f5f0438d74d --- /dev/null +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -0,0 +1,163 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include + +void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + struct tcphdr otcph, *tcph; + unsigned int otcplen, hh_len; + int tcphoff, needs_ack; + const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); + struct ipv6hdr *ip6h; +#define DEFAULT_TOS_VALUE 0x0U + const __u8 tclass = DEFAULT_TOS_VALUE; + struct dst_entry *dst = NULL; + u8 proto; + __be16 frag_off; + struct flowi6 fl6; + + if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || + (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { + pr_debug("addr is not unicast.\n"); + return; + } + + proto = oip6h->nexthdr; + tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto, &frag_off); + + if ((tcphoff < 0) || (tcphoff > oldskb->len)) { + pr_debug("Cannot get TCP header.\n"); + return; + } + + otcplen = oldskb->len - tcphoff; + + /* IP header checks: fragment, too short. */ + if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) { + pr_debug("proto(%d) != IPPROTO_TCP, " + "or too short. otcplen = %d\n", + proto, otcplen); + return; + } + + if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) + BUG(); + + /* No RST for RST. */ + if (otcph.rst) { + pr_debug("RST is set\n"); + return; + } + + /* Check checksum. */ + if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) { + pr_debug("TCP checksum is invalid\n"); + return; + } + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = oip6h->daddr; + fl6.daddr = oip6h->saddr; + fl6.fl6_sport = otcph.dest; + fl6.fl6_dport = otcph.source; + security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL || dst->error) { + dst_release(dst); + return; + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + return; + + hh_len = (dst->dev->hard_header_len + 15)&~15; + nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) + + sizeof(struct tcphdr) + dst->trailer_len, + GFP_ATOMIC); + + if (!nskb) { + net_dbg_ratelimited("cannot alloc skb\n"); + dst_release(dst); + return; + } + + skb_dst_set(nskb, dst); + + skb_reserve(nskb, hh_len + dst->header_len); + + skb_put(nskb, sizeof(struct ipv6hdr)); + skb_reset_network_header(nskb); + ip6h = ipv6_hdr(nskb); + ip6_flow_hdr(ip6h, tclass, 0); + ip6h->hop_limit = ip6_dst_hoplimit(dst); + ip6h->nexthdr = IPPROTO_TCP; + ip6h->saddr = oip6h->daddr; + ip6h->daddr = oip6h->saddr; + + skb_reset_transport_header(nskb); + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + /* Truncate to length (no data) */ + tcph->doff = sizeof(struct tcphdr)/4; + tcph->source = otcph.dest; + tcph->dest = otcph.source; + + if (otcph.ack) { + needs_ack = 0; + tcph->seq = otcph.ack_seq; + tcph->ack_seq = 0; + } else { + needs_ack = 1; + tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin + + otcplen - (otcph.doff<<2)); + tcph->seq = 0; + } + + /* Reset flags */ + ((u_int8_t *)tcph)[13] = 0; + tcph->rst = 1; + tcph->ack = needs_ack; + tcph->window = 0; + tcph->urg_ptr = 0; + tcph->check = 0; + + /* Adjust TCP checksum */ + tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, + &ipv6_hdr(nskb)->daddr, + sizeof(struct tcphdr), IPPROTO_TCP, + csum_partial(tcph, + sizeof(struct tcphdr), 0)); + + nf_ct_attach(nskb, oldskb); + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + /* If we use ip6_local_out for bridged traffic, the MAC source on + * the RST will be ours, instead of the destination's. This confuses + * some routers/firewalls, and they drop the packet. So we need to + * build the eth header using the original destination's MAC as the + * source, and send the RST packet directly. + */ + if (oldskb->nf_bridge) { + struct ethhdr *oeth = eth_hdr(oldskb); + nskb->dev = oldskb->nf_bridge->physindev; + nskb->protocol = htons(ETH_P_IPV6); + ip6h->payload_len = htons(sizeof(struct tcphdr)); + if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), + oeth->h_source, oeth->h_dest, nskb->len) < 0) + return; + dev_queue_xmit(nskb); + } else +#endif + ip6_local_out(nskb); +} +EXPORT_SYMBOL_GPL(nf_send_reset6); -- cgit v1.2.3 From 4b7fd5d97ee6e599247b4a55122ca6ba80c8148d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 2 Oct 2014 11:13:21 +0200 Subject: netfilter: explicit module dependency between br_netfilter and physdev You can use physdev to match the physical interface enslaved to the bridge device. This information is stored in skb->nf_bridge and it is set up by br_netfilter. So, this is only available when iptables is used from the bridge netfilter path. Since 34666d4 ("netfilter: bridge: move br_netfilter out of the core"), the br_netfilter code is modular. To reduce the impact of this change, we can autoload the br_netfilter if the physdev match is used since we assume that the users need br_netfilter in place. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/br_netfilter.h | 6 ++++++ net/bridge/br_netfilter.c | 5 +++++ net/netfilter/xt_physdev.c | 3 +++ 3 files changed, 14 insertions(+) create mode 100644 include/net/netfilter/br_netfilter.h (limited to 'include') diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h new file mode 100644 index 000000000000..2aa6048a55c1 --- /dev/null +++ b/include/net/netfilter/br_netfilter.h @@ -0,0 +1,6 @@ +#ifndef _BR_NETFILTER_H_ +#define _BR_NETFILTER_H_ + +void br_netfilter_enable(void); + +#endif /* _BR_NETFILTER_H_ */ diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 97e43937aaca..fa1270cc5086 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -856,6 +856,11 @@ static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, return NF_ACCEPT; } +void br_netfilter_enable(void) +{ +} +EXPORT_SYMBOL_GPL(br_netfilter_enable); + /* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because * br_dev_queue_push_xmit is called afterwards */ static struct nf_hook_ops br_nf_ops[] __read_mostly = { diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index d7ca16b8b8df..f440f57a452f 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -13,6 +13,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Bart De Schuymer "); @@ -87,6 +88,8 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) { const struct xt_physdev_info *info = par->matchinfo; + br_netfilter_enable(); + if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || info->bitmask & ~XT_PHYSDEV_OP_MASK) return -EINVAL; -- cgit v1.2.3 From 07dcc686fa8f6667dec4696804cdb43a90267b9a Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 30 Sep 2014 10:50:06 +0900 Subject: ipvs: Clean up comment style in ip_vs.h * Consistently use the multi-line comment style for networking code: /* This * That * The other thing */ * Use single-line comment style for comments with only one line of text. * In general follow the leading '*' of each line of a comment with a single space and then text. * Add missing line break between functions, remove double line break, align comments to previous lines whenever possible. Reported-by: Sergei Shtylyov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 214 ++++++++++++++++++---------------------------------- 1 file changed, 75 insertions(+), 139 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 576d7f0bed5d..615b20b58545 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1,6 +1,5 @@ -/* - * IP Virtual Server - * data structure and functionality definitions +/* IP Virtual Server + * data structure and functionality definitions */ #ifndef _NET_IP_VS_H @@ -12,7 +11,7 @@ #include /* for struct list_head */ #include /* for struct rwlock_t */ -#include /* for struct atomic_t */ +#include /* for struct atomic_t */ #include #include #include @@ -30,15 +29,13 @@ #endif #include /* Netw namespace */ -/* - * Generic access of ipvs struct - */ +/* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) { return net->ipvs; } -/* - * Get net ptr from skb in traffic cases + +/* Get net ptr from skb in traffic cases * use skb_sknet when call is from userland (ioctl or netlink) */ static inline struct net *skb_net(const struct sk_buff *skb) @@ -90,8 +87,8 @@ static inline struct net *skb_sknet(const struct sk_buff *skb) return &init_net; #endif } -/* - * This one needed for single_open_net since net is stored directly in + +/* This one needed for single_open_net since net is stored directly in * private not as a struct i.e. seq_file_net can't be used. */ static inline struct net *seq_file_single_net(struct seq_file *seq) @@ -108,7 +105,7 @@ extern int ip_vs_conn_tab_size; struct ip_vs_iphdr { __u32 len; /* IPv4 simply where L4 starts - IPv6 where L4 Transport Header starts */ + * IPv6 where L4 Transport Header starts */ __u16 fragoffs; /* IPv6 fragment offset, 0 if first frag (or not frag)*/ __s16 protocol; __s32 flags; @@ -304,16 +301,11 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, #define LeaveFunction(level) do {} while (0) #endif - -/* - * The port number of FTP service (in network order). - */ +/* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) -/* - * TCP State Values - */ +/* TCP State Values */ enum { IP_VS_TCP_S_NONE = 0, IP_VS_TCP_S_ESTABLISHED, @@ -329,25 +321,19 @@ enum { IP_VS_TCP_S_LAST }; -/* - * UDP State Values - */ +/* UDP State Values */ enum { IP_VS_UDP_S_NORMAL, IP_VS_UDP_S_LAST, }; -/* - * ICMP State Values - */ +/* ICMP State Values */ enum { IP_VS_ICMP_S_NORMAL, IP_VS_ICMP_S_LAST, }; -/* - * SCTP State Values - */ +/* SCTP State Values */ enum ip_vs_sctp_states { IP_VS_SCTP_S_NONE, IP_VS_SCTP_S_INIT1, @@ -366,21 +352,18 @@ enum ip_vs_sctp_states { IP_VS_SCTP_S_LAST }; -/* - * Delta sequence info structure - * Each ip_vs_conn has 2 (output AND input seq. changes). - * Only used in the VS/NAT. +/* Delta sequence info structure + * Each ip_vs_conn has 2 (output AND input seq. changes). + * Only used in the VS/NAT. */ struct ip_vs_seq { __u32 init_seq; /* Add delta from this seq */ __u32 delta; /* Delta in sequence numbers */ __u32 previous_delta; /* Delta in sequence numbers - before last resized pkt */ + * before last resized pkt */ }; -/* - * counters per cpu - */ +/* counters per cpu */ struct ip_vs_counters { __u32 conns; /* connections scheduled */ __u32 inpkts; /* incoming packets */ @@ -388,17 +371,13 @@ struct ip_vs_counters { __u64 inbytes; /* incoming bytes */ __u64 outbytes; /* outgoing bytes */ }; -/* - * Stats per cpu - */ +/* Stats per cpu */ struct ip_vs_cpu_stats { struct ip_vs_counters ustats; struct u64_stats_sync syncp; }; -/* - * IPVS statistics objects - */ +/* IPVS statistics objects */ struct ip_vs_estimator { struct list_head list; @@ -491,9 +470,7 @@ struct ip_vs_protocol { void (*timeout_change)(struct ip_vs_proto_data *pd, int flags); }; -/* - * protocol data per netns - */ +/* protocol data per netns */ struct ip_vs_proto_data { struct ip_vs_proto_data *next; struct ip_vs_protocol *pp; @@ -520,9 +497,7 @@ struct ip_vs_conn_param { __u8 pe_data_len; }; -/* - * IP_VS structure allocated for each dynamically scheduled connection - */ +/* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { struct hlist_node c_list; /* hashed list heads */ /* Protocol, addresses and port numbers */ @@ -561,17 +536,18 @@ struct ip_vs_conn { struct ip_vs_dest *dest; /* real server */ atomic_t in_pkts; /* incoming packet counter */ - /* packet transmitter for different forwarding methods. If it - mangles the packet, it must return NF_DROP or better NF_STOLEN, - otherwise this must be changed to a sk_buff **. - NF_ACCEPT can be returned when destination is local. + /* Packet transmitter for different forwarding methods. If it + * mangles the packet, it must return NF_DROP or better NF_STOLEN, + * otherwise this must be changed to a sk_buff **. + * NF_ACCEPT can be returned when destination is local. */ int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); /* Note: we can group the following members into a structure, - in order to save more space, and the following members are - only used in VS/NAT anyway */ + * in order to save more space, and the following members are + * only used in VS/NAT anyway + */ struct ip_vs_app *app; /* bound ip_vs_app object */ void *app_data; /* Application private data */ struct ip_vs_seq in_seq; /* incoming seq. struct */ @@ -584,9 +560,7 @@ struct ip_vs_conn { struct rcu_head rcu_head; }; -/* - * To save some memory in conn table when name space is disabled. - */ +/* To save some memory in conn table when name space is disabled. */ static inline struct net *ip_vs_conn_net(const struct ip_vs_conn *cp) { #ifdef CONFIG_NET_NS @@ -595,6 +569,7 @@ static inline struct net *ip_vs_conn_net(const struct ip_vs_conn *cp) return &init_net; #endif } + static inline void ip_vs_conn_net_set(struct ip_vs_conn *cp, struct net *net) { #ifdef CONFIG_NET_NS @@ -612,13 +587,12 @@ static inline int ip_vs_conn_net_eq(const struct ip_vs_conn *cp, #endif } -/* - * Extended internal versions of struct ip_vs_service_user and - * ip_vs_dest_user for IPv6 support. +/* Extended internal versions of struct ip_vs_service_user and ip_vs_dest_user + * for IPv6 support. * - * We need these to conveniently pass around service and destination - * options, but unfortunately, we also need to keep the old definitions to - * maintain userspace backwards compatibility for the setsockopt interface. + * We need these to conveniently pass around service and destination + * options, but unfortunately, we also need to keep the old definitions to + * maintain userspace backwards compatibility for the setsockopt interface. */ struct ip_vs_service_user_kern { /* virtual service addresses */ @@ -656,8 +630,8 @@ struct ip_vs_dest_user_kern { /* - * The information about the virtual service offered to the net - * and the forwarding entries + * The information about the virtual service offered to the net and the + * forwarding entries. */ struct ip_vs_service { struct hlist_node s_list; /* for normal service table */ @@ -697,9 +671,8 @@ struct ip_vs_dest_dst { struct rcu_head rcu_head; }; -/* - * The real server destination forwarding entry - * with ip address, port number, and so on. +/* The real server destination forwarding entry with ip address, port number, + * and so on. */ struct ip_vs_dest { struct list_head n_list; /* for the dests in the service */ @@ -738,10 +711,7 @@ struct ip_vs_dest { unsigned int in_rs_table:1; /* we are in rs_table */ }; - -/* - * The scheduler object - */ +/* The scheduler object */ struct ip_vs_scheduler { struct list_head n_list; /* d-linked list head */ char *name; /* scheduler name */ @@ -781,9 +751,7 @@ struct ip_vs_pe { int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); }; -/* - * The application module object (a.k.a. app incarnation) - */ +/* The application module object (a.k.a. app incarnation) */ struct ip_vs_app { struct list_head a_list; /* member in app list */ int type; /* IP_VS_APP_TYPE_xxx */ @@ -799,16 +767,14 @@ struct ip_vs_app { atomic_t usecnt; /* usage counter */ struct rcu_head rcu_head; - /* - * output hook: Process packet in inout direction, diff set for TCP. + /* output hook: Process packet in inout direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff); - /* - * input hook: Process packet in outin direction, diff set for TCP. + /* input hook: Process packet in outin direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated */ @@ -867,9 +833,7 @@ struct ipvs_master_sync_state { struct netns_ipvs { int gen; /* Generation */ int enable; /* enable like nf_hooks do */ - /* - * Hash table: for real service lookups - */ + /* Hash table: for real service lookups */ #define IP_VS_RTAB_BITS 4 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) @@ -903,7 +867,7 @@ struct netns_ipvs { struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; #endif /* ip_vs_conn */ - atomic_t conn_count; /* connection counter */ + atomic_t conn_count; /* connection counter */ /* ip_vs_ctl */ struct ip_vs_stats tot_stats; /* Statistics & est. */ @@ -990,9 +954,9 @@ struct netns_ipvs { char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; /* net name space ptr */ struct net *net; /* Needed by timer routines */ - /* Number of heterogeneous destinations, needed because - * heterogeneous are not supported when synchronization is - * enabled */ + /* Number of heterogeneous destinations, needed becaus heterogeneous + * are not supported when synchronization is enabled. + */ unsigned int mixed_address_family_dests; }; @@ -1147,9 +1111,8 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs) #endif -/* - * IPVS core functions - * (from ip_vs_core.c) +/* IPVS core functions + * (from ip_vs_core.c) */ const char *ip_vs_proto_name(unsigned int proto); void ip_vs_init_hash_table(struct list_head *table, int rows); @@ -1157,11 +1120,9 @@ void ip_vs_init_hash_table(struct list_head *table, int rows); #define IP_VS_APP_TYPE_FTP 1 -/* - * ip_vs_conn handling functions - * (from ip_vs_conn.c) +/* ip_vs_conn handling functions + * (from ip_vs_conn.c) */ - enum { IP_VS_DIR_INPUT = 0, IP_VS_DIR_OUTPUT, @@ -1292,9 +1253,7 @@ ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) atomic_inc(&ctl_cp->n_control); } -/* - * IPVS netns init & cleanup functions - */ +/* IPVS netns init & cleanup functions */ int ip_vs_estimator_net_init(struct net *net); int ip_vs_control_net_init(struct net *net); int ip_vs_protocol_net_init(struct net *net); @@ -1309,9 +1268,8 @@ void ip_vs_estimator_net_cleanup(struct net *net); void ip_vs_sync_net_cleanup(struct net *net); void ip_vs_service_net_cleanup(struct net *net); -/* - * IPVS application functions - * (from ip_vs_app.c) +/* IPVS application functions + * (from ip_vs_app.c) */ #define IP_VS_APP_MAX_PORTS 8 struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app); @@ -1331,9 +1289,7 @@ int unregister_ip_vs_pe(struct ip_vs_pe *pe); struct ip_vs_pe *ip_vs_pe_getbyname(const char *name); struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name); -/* - * Use a #define to avoid all of module.h just for these trivial ops - */ +/* Use a #define to avoid all of module.h just for these trivial ops */ #define ip_vs_pe_get(pe) \ if (pe && pe->module) \ __module_get(pe->module); @@ -1342,9 +1298,7 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name); if (pe && pe->module) \ module_put(pe->module); -/* - * IPVS protocol functions (from ip_vs_proto.c) - */ +/* IPVS protocol functions (from ip_vs_proto.c) */ int ip_vs_protocol_init(void); void ip_vs_protocol_cleanup(void); void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags); @@ -1362,9 +1316,8 @@ extern struct ip_vs_protocol ip_vs_protocol_esp; extern struct ip_vs_protocol ip_vs_protocol_ah; extern struct ip_vs_protocol ip_vs_protocol_sctp; -/* - * Registering/unregistering scheduler functions - * (from ip_vs_sched.c) +/* Registering/unregistering scheduler functions + * (from ip_vs_sched.c) */ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); @@ -1383,10 +1336,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg); - -/* - * IPVS control data and functions (from ip_vs_ctl.c) - */ +/* IPVS control data and functions (from ip_vs_ctl.c) */ extern struct ip_vs_stats ip_vs_stats; extern int sysctl_ip_vs_sync_ver; @@ -1427,26 +1377,21 @@ static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest) kfree(dest); } -/* - * IPVS sync daemon data and function prototypes - * (from ip_vs_sync.c) +/* IPVS sync daemon data and function prototypes + * (from ip_vs_sync.c) */ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid); int stop_sync_thread(struct net *net, int state); void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts); -/* - * IPVS rate estimator prototypes (from ip_vs_est.c) - */ +/* IPVS rate estimator prototypes (from ip_vs_est.c) */ void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_stats_user *dst, struct ip_vs_stats *stats); -/* - * Various IPVS packet transmitters (from ip_vs_xmit.c) - */ +/* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, @@ -1477,12 +1422,10 @@ int ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, #endif #ifdef CONFIG_SYSCTL -/* - * This is a simple mechanism to ignore packets when - * we are loaded. Just set ip_vs_drop_rate to 'n' and - * we start to drop 1/rate of the packets +/* This is a simple mechanism to ignore packets when + * we are loaded. Just set ip_vs_drop_rate to 'n' and + * we start to drop 1/rate of the packets */ - static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { if (!ipvs->drop_rate) @@ -1496,9 +1439,7 @@ static inline int ip_vs_todrop(struct netns_ipvs *ipvs) static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { return 0; } #endif -/* - * ip_vs_fwd_tag returns the forwarding tag of the connection - */ +/* ip_vs_fwd_tag returns the forwarding tag of the connection */ #define IP_VS_FWD_METHOD(cp) (cp->flags & IP_VS_CONN_F_FWD_MASK) static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp) @@ -1557,9 +1498,7 @@ static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum) return csum_partial(diff, sizeof(diff), oldsum); } -/* - * Forget current conntrack (unconfirmed) and attach notrack entry - */ +/* Forget current conntrack (unconfirmed) and attach notrack entry */ static inline void ip_vs_notrack(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) @@ -1576,9 +1515,8 @@ static inline void ip_vs_notrack(struct sk_buff *skb) } #ifdef CONFIG_IP_VS_NFCT -/* - * Netfilter connection tracking - * (from ip_vs_nfct.c) +/* Netfilter connection tracking + * (from ip_vs_nfct.c) */ static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { @@ -1617,14 +1555,12 @@ static inline int ip_vs_confirm_conntrack(struct sk_buff *skb) static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) { } -/* CONFIG_IP_VS_NFCT */ -#endif +#endif /* CONFIG_IP_VS_NFCT */ static inline int ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) { - /* - * We think the overhead of processing active connections is 256 + /* We think the overhead of processing active connections is 256 * times higher than that of inactive connections in average. (This * 256 times might not be accurate, we will change it later) We * use the following formula to estimate the overhead now: -- cgit v1.2.3 From dba4b74d2da8798626e2b702ad3f452671e335f7 Mon Sep 17 00:00:00 2001 From: Vladimir Kondratiev Date: Wed, 1 Oct 2014 15:05:25 +0300 Subject: wil6210: atomic I/O for the card memory Introduce netdev IOCTLs, to be used by the debug tools. Allows to read/write single dword value or memory block, aligned to dword Different address modes supported: - BAR offset - Firmware "linker" address - target's AHB bus Signed-off-by: Vladimir Kondratiev Signed-off-by: John W. Linville --- MAINTAINERS | 1 + drivers/net/wireless/ath/wil6210/Makefile | 1 + drivers/net/wireless/ath/wil6210/ioctl.c | 173 +++++++++++++++++++++++++++++ drivers/net/wireless/ath/wil6210/netdev.c | 12 ++ drivers/net/wireless/ath/wil6210/wil6210.h | 2 + include/uapi/linux/wil6210_uapi.h | 87 +++++++++++++++ 6 files changed, 276 insertions(+) create mode 100644 drivers/net/wireless/ath/wil6210/ioctl.c create mode 100644 include/uapi/linux/wil6210_uapi.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 482888d80431..66de6b2923d4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1611,6 +1611,7 @@ L: wil6210@qca.qualcomm.com S: Supported W: http://wireless.kernel.org/en/users/Drivers/wil6210 F: drivers/net/wireless/ath/wil6210/ +F: include/uapi/linux/wil6210_uapi.h CARL9170 LINUX COMMUNITY WIRELESS DRIVER M: Christian Lamparter diff --git a/drivers/net/wireless/ath/wil6210/Makefile b/drivers/net/wireless/ath/wil6210/Makefile index 05f29a6b1ba8..8ad4b5f97e04 100644 --- a/drivers/net/wireless/ath/wil6210/Makefile +++ b/drivers/net/wireless/ath/wil6210/Makefile @@ -10,6 +10,7 @@ wil6210-y += interrupt.o wil6210-y += txrx.o wil6210-y += debug.o wil6210-y += rx_reorder.o +wil6210-y += ioctl.o wil6210-y += fw.o wil6210-$(CONFIG_WIL6210_TRACING) += trace.o wil6210-y += wil_platform.o diff --git a/drivers/net/wireless/ath/wil6210/ioctl.c b/drivers/net/wireless/ath/wil6210/ioctl.c new file mode 100644 index 000000000000..e9c0673819c6 --- /dev/null +++ b/drivers/net/wireless/ath/wil6210/ioctl.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2014 Qualcomm Atheros, Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include "wil6210.h" +#include + +#define wil_hex_dump_ioctl(prefix_str, buf, len) \ + print_hex_dump_debug("DBG[IOC ]" prefix_str, \ + DUMP_PREFIX_OFFSET, 16, 1, buf, len, true) +#define wil_dbg_ioctl(wil, fmt, arg...) wil_dbg(wil, "DBG[IOC ]" fmt, ##arg) + +static void __iomem *wil_ioc_addr(struct wil6210_priv *wil, uint32_t addr, + uint32_t size, enum wil_memio_op op) +{ + void __iomem *a; + u32 off; + + switch (op & wil_mmio_addr_mask) { + case wil_mmio_addr_linker: + a = wmi_buffer(wil, cpu_to_le32(addr)); + break; + case wil_mmio_addr_ahb: + a = wmi_addr(wil, addr); + break; + case wil_mmio_addr_bar: + a = wmi_addr(wil, addr + WIL6210_FW_HOST_OFF); + break; + default: + wil_err(wil, "Unsupported address mode, op = 0x%08x\n", op); + return NULL; + } + + off = a - wil->csr; + if (size >= WIL6210_MEM_SIZE - off) { + wil_err(wil, "Requested block does not fit into memory: " + "off = 0x%08x size = 0x%08x\n", off, size); + return NULL; + } + + return a; +} + +static int wil_ioc_memio_dword(struct wil6210_priv *wil, void __user *data) +{ + struct wil_memio io; + void __iomem *a; + bool need_copy = false; + + if (copy_from_user(&io, data, sizeof(io))) + return -EFAULT; + + wil_dbg_ioctl(wil, "IO: addr = 0x%08x val = 0x%08x op = 0x%08x\n", + io.addr, io.val, io.op); + + a = wil_ioc_addr(wil, io.addr, sizeof(u32), io.op); + if (!a) { + wil_err(wil, "invalid address 0x%08x, op = 0x%08x\n", io.addr, + io.op); + return -EINVAL; + } + /* operation */ + switch (io.op & wil_mmio_op_mask) { + case wil_mmio_read: + io.val = ioread32(a); + need_copy = true; + break; + case wil_mmio_write: + iowrite32(io.val, a); + wmb(); /* make sure write propagated to HW */ + break; + default: + wil_err(wil, "Unsupported operation, op = 0x%08x\n", io.op); + return -EINVAL; + } + + if (need_copy) { + wil_dbg_ioctl(wil, "IO done: addr = 0x%08x" + " val = 0x%08x op = 0x%08x\n", + io.addr, io.val, io.op); + if (copy_to_user(data, &io, sizeof(io))) + return -EFAULT; + } + + return 0; +} + +static int wil_ioc_memio_block(struct wil6210_priv *wil, void __user *data) +{ + struct wil_memio_block io; + void *block; + void __iomem *a; + int rc = 0; + + if (copy_from_user(&io, data, sizeof(io))) + return -EFAULT; + + wil_dbg_ioctl(wil, "IO: addr = 0x%08x size = 0x%08x op = 0x%08x\n", + io.addr, io.size, io.op); + + /* size */ + if (io.size % 4) { + wil_err(wil, "size is not multiple of 4: 0x%08x\n", io.size); + return -EINVAL; + } + + a = wil_ioc_addr(wil, io.addr, io.size, io.op); + if (!a) { + wil_err(wil, "invalid address 0x%08x, op = 0x%08x\n", io.addr, + io.op); + return -EINVAL; + } + + block = kmalloc(io.size, GFP_USER); + if (!block) + return -ENOMEM; + + /* operation */ + switch (io.op & wil_mmio_op_mask) { + case wil_mmio_read: + wil_memcpy_fromio_32(block, a, io.size); + wil_hex_dump_ioctl("Read ", block, io.size); + if (copy_to_user(io.block, block, io.size)) { + rc = -EFAULT; + goto out_free; + } + break; + case wil_mmio_write: + if (copy_from_user(block, io.block, io.size)) { + rc = -EFAULT; + goto out_free; + } + wil_memcpy_toio_32(a, block, io.size); + wmb(); /* make sure write propagated to HW */ + wil_hex_dump_ioctl("Write ", block, io.size); + break; + default: + wil_err(wil, "Unsupported operation, op = 0x%08x\n", io.op); + rc = -EINVAL; + break; + } + +out_free: + kfree(block); + return rc; +} + +int wil_ioctl(struct wil6210_priv *wil, void __user *data, int cmd) +{ + switch (cmd) { + case WIL_IOCTL_MEMIO: + return wil_ioc_memio_dword(wil, data); + case WIL_IOCTL_MEMIO_BLOCK: + return wil_ioc_memio_block(wil, data); + default: + wil_dbg_ioctl(wil, "Unsupported IOCTL 0x%04x\n", cmd); + return -ENOIOCTLCMD; + } +} diff --git a/drivers/net/wireless/ath/wil6210/netdev.c b/drivers/net/wireless/ath/wil6210/netdev.c index c3f0ddfaa5da..239965106c05 100644 --- a/drivers/net/wireless/ath/wil6210/netdev.c +++ b/drivers/net/wireless/ath/wil6210/netdev.c @@ -52,6 +52,17 @@ static int wil_change_mtu(struct net_device *ndev, int new_mtu) return 0; } +static int wil_do_ioctl(struct net_device *ndev, struct ifreq *ifr, int cmd) +{ + struct wil6210_priv *wil = ndev_to_wil(ndev); + + int ret = wil_ioctl(wil, ifr->ifr_data, cmd); + + wil_dbg_misc(wil, "ioctl(0x%04x) -> %d\n", cmd, ret); + + return ret; +} + static const struct net_device_ops wil_netdev_ops = { .ndo_open = wil_open, .ndo_stop = wil_stop, @@ -59,6 +70,7 @@ static const struct net_device_ops wil_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = wil_change_mtu, + .ndo_do_ioctl = wil_do_ioctl, }; static int wil6210_netdev_poll_rx(struct napi_struct *napi, int budget) diff --git a/drivers/net/wireless/ath/wil6210/wil6210.h b/drivers/net/wireless/ath/wil6210/wil6210.h index 2991609885f7..61cceca155a2 100644 --- a/drivers/net/wireless/ath/wil6210/wil6210.h +++ b/drivers/net/wireless/ath/wil6210/wil6210.h @@ -595,5 +595,7 @@ void wil6210_unmask_irq_rx(struct wil6210_priv *wil); int wil_iftype_nl2wmi(enum nl80211_iftype type); +int wil_ioctl(struct wil6210_priv *wil, void __user *data, int cmd); int wil_request_firmware(struct wil6210_priv *wil, const char *name); + #endif /* __WIL6210_H__ */ diff --git a/include/uapi/linux/wil6210_uapi.h b/include/uapi/linux/wil6210_uapi.h new file mode 100644 index 000000000000..6a3cddd156c4 --- /dev/null +++ b/include/uapi/linux/wil6210_uapi.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2014 Qualcomm Atheros, Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __WIL6210_UAPI_H__ +#define __WIL6210_UAPI_H__ + +#if !defined(__KERNEL__) +#define __user +#endif + +#include + +/* Numbers SIOCDEVPRIVATE and SIOCDEVPRIVATE + 1 + * are used by Android devices to implement PNO (preferred network offload). + * Albeit it is temporary solution, use different numbers to avoid conflicts + */ + +/** + * Perform 32-bit I/O operation to the card memory + * + * User code should arrange data in memory like this: + * + * struct wil_memio io; + * struct ifreq ifr = { + * .ifr_data = &io, + * }; + */ +#define WIL_IOCTL_MEMIO (SIOCDEVPRIVATE + 2) + +/** + * Perform block I/O operation to the card memory + * + * User code should arrange data in memory like this: + * + * void *buf; + * struct wil_memio_block io = { + * .block = buf, + * }; + * struct ifreq ifr = { + * .ifr_data = &io, + * }; + */ +#define WIL_IOCTL_MEMIO_BLOCK (SIOCDEVPRIVATE + 3) + +/** + * operation to perform + * + * @wil_mmio_op_mask - bits defining operation, + * @wil_mmio_addr_mask - bits defining addressing mode + */ +enum wil_memio_op { + wil_mmio_read = 0, + wil_mmio_write = 1, + wil_mmio_op_mask = 0xff, + wil_mmio_addr_linker = 0 << 8, + wil_mmio_addr_ahb = 1 << 8, + wil_mmio_addr_bar = 2 << 8, + wil_mmio_addr_mask = 0xff00, +}; + +struct wil_memio { + uint32_t op; /* enum wil_memio_op */ + uint32_t addr; /* should be 32-bit aligned */ + uint32_t val; +}; + +struct wil_memio_block { + uint32_t op; /* enum wil_memio_op */ + uint32_t addr; /* should be 32-bit aligned */ + uint32_t size; /* should be multiple of 4 */ + void __user *block; /* block address */ +}; + +#endif /* __WIL6210_UAPI_H__ */ -- cgit v1.2.3 From 5772e9a3463b264cee5a4e73ef586ad482d7ba48 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 1 Oct 2014 22:35:59 +0200 Subject: qdisc: bulk dequeue support for qdiscs with TCQ_F_ONETXQUEUE Based on DaveM's recent API work on dev_hard_start_xmit(), that allows sending/processing an entire skb list. This patch implements qdisc bulk dequeue, by allowing multiple packets to be dequeued in dequeue_skb(). The optimization principle for this is two fold, (1) to amortize locking cost and (2) avoid expensive tailptr update for notifying HW. (1) Several packets are dequeued while holding the qdisc root_lock, amortizing locking cost over several packet. The dequeued SKB list is processed under the TXQ lock in dev_hard_start_xmit(), thus also amortizing the cost of the TXQ lock. (2) Further more, dev_hard_start_xmit() will utilize the skb->xmit_more API to delay HW tailptr update, which also reduces the cost per packet. One restriction of the new API is that every SKB must belong to the same TXQ. This patch takes the easy way out, by restricting bulk dequeue to qdisc's with the TCQ_F_ONETXQUEUE flag, that specifies the qdisc only have attached a single TXQ. Some detail about the flow; dev_hard_start_xmit() will process the skb list, and transmit packets individually towards the driver (see xmit_one()). In case the driver stops midway in the list, the remaining skb list is returned by dev_hard_start_xmit(). In sch_direct_xmit() this returned list is requeued by dev_requeue_skb(). To avoid overshooting the HW limits, which results in requeuing, the patch limits the amount of bytes dequeued, based on the drivers BQL limits. In-effect bulking will only happen for BQL enabled drivers. Small amounts for extra HoL blocking (2x MTU/0.24ms) were measured at 100Mbit/s, with bulking 8 packets, but the oscillating nature of the measurement indicate something, like sched latency might be causing this effect. More comparisons show, that this oscillation goes away occationally. Thus, we disregard this artifact completely and remove any "magic" bulking limit. For now, as a conservative approach, stop bulking when seeing TSO and segmented GSO packets. They already benefit from bulking on their own. A followup patch add this, to allow easier bisect-ability for finding regressions. Jointed work with Hannes, Daniel and Florian. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Hannes Frederic Sowa Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/sch_generic.h | 16 ++++++++++++++++ net/sched/sch_generic.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f12669819d1a..d17ed6fb2f70 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -119,6 +120,21 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) qdisc->__state &= ~__QDISC___STATE_RUNNING; } +static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) +{ + return qdisc->flags & TCQ_F_ONETXQUEUE; +} + +static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq) +{ +#ifdef CONFIG_BQL + /* Non-BQL migrated drivers will return 0, too. */ + return dql_avail(&txq->dql); +#else + return 0; +#endif +} + static inline bool qdisc_is_throttled(const struct Qdisc *qdisc) { return test_bit(__QDISC_STATE_THROTTLED, &qdisc->state) ? true : false; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7c8e5d73d433..c2e87e63b832 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -56,6 +56,41 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) return 0; } +static struct sk_buff *try_bulk_dequeue_skb(struct Qdisc *q, + struct sk_buff *head_skb, + int bytelimit) +{ + struct sk_buff *skb, *tail_skb = head_skb; + + while (bytelimit > 0) { + /* For now, don't bulk dequeue GSO (or GSO segmented) pkts */ + if (tail_skb->next || skb_is_gso(tail_skb)) + break; + + skb = q->dequeue(q); + if (!skb) + break; + + bytelimit -= skb->len; /* covers GSO len */ + skb = validate_xmit_skb(skb, qdisc_dev(q)); + if (!skb) + break; + + /* "skb" can be a skb list after validate call above + * (GSO segmented), but it is okay to append it to + * current tail_skb->next, because next round will exit + * in-case "tail_skb->next" is a skb list. + */ + tail_skb->next = skb; + tail_skb = skb; + } + + return head_skb; +} + +/* Note that dequeue_skb can possibly return a SKB list (via skb->next). + * A requeued skb (via q->gso_skb) can also be a SKB list. + */ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) { struct sk_buff *skb = q->gso_skb; @@ -70,10 +105,17 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) } else skb = NULL; } else { - if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) { + if (!(q->flags & TCQ_F_ONETXQUEUE) || + !netif_xmit_frozen_or_stopped(txq)) { + int bytelimit = qdisc_avail_bulklimit(txq); + skb = q->dequeue(q); - if (skb) + if (skb) { + bytelimit -= skb->len; skb = validate_xmit_skb(skb, qdisc_dev(q)); + } + if (skb && qdisc_may_bulk(q)) + skb = try_bulk_dequeue_skb(q, skb, bytelimit); } } -- cgit v1.2.3 From 55a93b3ea780908b7d1b3a8cf1976223a9268d78 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Oct 2014 15:31:07 -0700 Subject: qdisc: validate skb without holding lock Validation of skb can be pretty expensive : GSO segmentation and/or checksum computations. We can do this without holding qdisc lock, so that other cpus can queue additional packets. Trick is that requeued packets were already validated, so we carry a boolean so that sch_direct_xmit() can validate a fresh skb list, or directly use an old one. Tested on 40Gb NIC (8 TX queues) and 200 concurrent flows, 48 threads host. Turning TSO on or off had no effect on throughput, only few more cpu cycles. Lock contention on qdisc lock disappeared. Same if disabling TX checksum offload. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- include/net/pkt_sched.h | 2 +- net/core/dev.c | 29 +++++++++++++++++++--- net/sched/sch_generic.c | 61 ++++++++++++++++++++++------------------------- 4 files changed, 56 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9b7fbacb6296..910fb17ad148 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2821,7 +2821,7 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); -struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 8bbe626e9ece..e4b3c828c1c2 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -99,7 +99,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock); + spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); diff --git a/net/core/dev.c b/net/core/dev.c index e55c546717d4..1a90530f83ff 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2655,7 +2655,7 @@ struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t featur return skb; } -struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -2720,6 +2720,30 @@ out_null: return NULL; } +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) +{ + struct sk_buff *next, *head = NULL, *tail; + + while (skb) { + next = skb->next; + skb->next = NULL; + skb = validate_xmit_skb(skb, dev); + if (skb) { + struct sk_buff *end = skb; + + while (end->next) + end = end->next; + if (!head) + head = skb; + else + tail->next = skb; + tail = end; + } + skb = next; + } + return head; +} + static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -2786,8 +2810,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_bstats_update(q, skb); - skb = validate_xmit_skb(skb, dev); - if (skb && sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 797ebef73642..2b349a4de3c8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -56,40 +56,34 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) return 0; } -static struct sk_buff *try_bulk_dequeue_skb(struct Qdisc *q, - struct sk_buff *head_skb, - int bytelimit) +static void try_bulk_dequeue_skb(struct Qdisc *q, + struct sk_buff *skb, + const struct netdev_queue *txq) { - struct sk_buff *skb, *tail_skb = head_skb; + int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; while (bytelimit > 0) { - skb = q->dequeue(q); - if (!skb) - break; + struct sk_buff *nskb = q->dequeue(q); - bytelimit -= skb->len; /* covers GSO len */ - skb = validate_xmit_skb(skb, qdisc_dev(q)); - if (!skb) + if (!nskb) break; - while (tail_skb->next) /* GSO list goto tail */ - tail_skb = tail_skb->next; - - tail_skb->next = skb; - tail_skb = skb; + bytelimit -= nskb->len; /* covers GSO len */ + skb->next = nskb; + skb = nskb; } - - return head_skb; + skb->next = NULL; } /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ -static inline struct sk_buff *dequeue_skb(struct Qdisc *q) +static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate) { struct sk_buff *skb = q->gso_skb; const struct netdev_queue *txq = q->dev_queue; + *validate = true; if (unlikely(skb)) { /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); @@ -98,21 +92,16 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) q->q.qlen--; } else skb = NULL; + /* skb in gso_skb were already validated */ + *validate = false; } else { if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) { - int bytelimit = qdisc_avail_bulklimit(txq); - skb = q->dequeue(q); - if (skb) { - bytelimit -= skb->len; - skb = validate_xmit_skb(skb, qdisc_dev(q)); - } if (skb && qdisc_may_bulk(q)) - skb = try_bulk_dequeue_skb(q, skb, bytelimit); + try_bulk_dequeue_skb(q, skb, txq); } } - return skb; } @@ -156,19 +145,24 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, */ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock) + spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; /* And release qdisc */ spin_unlock(root_lock); - HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_stopped(txq)) - skb = dev_hard_start_xmit(skb, dev, txq, &ret); + /* Note that we validate skb (GSO, checksum, ...) outside of locks */ + if (validate) + skb = validate_xmit_skb_list(skb, dev); - HARD_TX_UNLOCK(dev, txq); + if (skb) { + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_stopped(txq)) + skb = dev_hard_start_xmit(skb, dev, txq, &ret); + HARD_TX_UNLOCK(dev, txq); + } spin_lock(root_lock); if (dev_xmit_complete(ret)) { @@ -217,9 +211,10 @@ static inline int qdisc_restart(struct Qdisc *q) struct net_device *dev; spinlock_t *root_lock; struct sk_buff *skb; + bool validate; /* Dequeue packet */ - skb = dequeue_skb(q); + skb = dequeue_skb(q, &validate); if (unlikely(!skb)) return 0; @@ -229,7 +224,7 @@ static inline int qdisc_restart(struct Qdisc *q) dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); - return sch_direct_xmit(skb, q, dev, txq, root_lock); + return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } void __qdisc_run(struct Qdisc *q) -- cgit v1.2.3 From c7a08ac7ee68b9af0d5af99c7b34b574cac4d144 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 2 Oct 2014 12:19:42 +0300 Subject: net/mlx5_core: Update device capabilities handling Rearrange struct mlx5_caps so it has a "gen" field to represent the current capabilities configured for the device. Max capabilities can also be queried from the device. Also update capabilities struct to contain more fields as per the latest revision if firmware specification. Signed-off-by: Eli Cohen Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx5/cq.c | 8 +- drivers/infiniband/hw/mlx5/mad.c | 2 +- drivers/infiniband/hw/mlx5/main.c | 83 ++++++---- drivers/infiniband/hw/mlx5/qp.c | 72 +++++--- drivers/infiniband/hw/mlx5/srq.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 24 ++- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 81 +-------- drivers/net/ethernet/mellanox/mlx5/core/main.c | 219 +++++++++++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/uar.c | 4 +- include/linux/mlx5/device.h | 24 ++- include/linux/mlx5/driver.h | 28 +++- 12 files changed, 331 insertions(+), 222 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index e4056279166d..10cfce5119a9 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -752,7 +752,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, return ERR_PTR(-EINVAL); entries = roundup_pow_of_two(entries + 1); - if (entries > dev->mdev->caps.max_cqes) + if (entries > dev->mdev->caps.gen.max_cqes) return ERR_PTR(-EINVAL); cq = kzalloc(sizeof(*cq), GFP_KERNEL); @@ -919,7 +919,7 @@ int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) int err; u32 fsel; - if (!(dev->mdev->caps.flags & MLX5_DEV_CAP_FLAG_CQ_MODER)) + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_CQ_MODER)) return -ENOSYS; in = kzalloc(sizeof(*in), GFP_KERNEL); @@ -1074,7 +1074,7 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) int uninitialized_var(cqe_size); unsigned long flags; - if (!(dev->mdev->caps.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { pr_info("Firmware does not support resize CQ\n"); return -ENOSYS; } @@ -1083,7 +1083,7 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) return -EINVAL; entries = roundup_pow_of_two(entries + 1); - if (entries > dev->mdev->caps.max_cqes + 1) + if (entries > dev->mdev->caps.gen.max_cqes + 1) return -EINVAL; if (entries == ibcq->cqe + 1) diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index b514bbb5610f..657af9a1167c 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -129,7 +129,7 @@ int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) packet_error = be16_to_cpu(out_mad->status); - dev->mdev->caps.ext_port_cap[port - 1] = (!err && !packet_error) ? + dev->mdev->caps.gen.ext_port_cap[port - 1] = (!err && !packet_error) ? MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0; out: diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d8907b20522a..f3114d1132fb 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -157,11 +157,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; + struct mlx5_general_caps *gen; int err = -ENOMEM; int max_rq_sg; int max_sq_sg; u64 flags; + gen = &dev->mdev->caps.gen; in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); if (!in_mad || !out_mad) @@ -183,7 +185,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN; - flags = dev->mdev->caps.flags; + flags = gen->flags; if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (flags & MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR) @@ -213,30 +215,31 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; - props->page_size_cap = dev->mdev->caps.min_page_sz; - props->max_qp = 1 << dev->mdev->caps.log_max_qp; - props->max_qp_wr = dev->mdev->caps.max_wqes; - max_rq_sg = dev->mdev->caps.max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg); - max_sq_sg = (dev->mdev->caps.max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) / + props->page_size_cap = gen->min_page_sz; + props->max_qp = 1 << gen->log_max_qp; + props->max_qp_wr = gen->max_wqes; + max_rq_sg = gen->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg); + max_sq_sg = (gen->max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) / sizeof(struct mlx5_wqe_data_seg); props->max_sge = min(max_rq_sg, max_sq_sg); - props->max_cq = 1 << dev->mdev->caps.log_max_cq; - props->max_cqe = dev->mdev->caps.max_cqes - 1; - props->max_mr = 1 << dev->mdev->caps.log_max_mkey; - props->max_pd = 1 << dev->mdev->caps.log_max_pd; - props->max_qp_rd_atom = dev->mdev->caps.max_ra_req_qp; - props->max_qp_init_rd_atom = dev->mdev->caps.max_ra_res_qp; + props->max_cq = 1 << gen->log_max_cq; + props->max_cqe = gen->max_cqes - 1; + props->max_mr = 1 << gen->log_max_mkey; + props->max_pd = 1 << gen->log_max_pd; + props->max_qp_rd_atom = 1 << gen->log_max_ra_req_qp; + props->max_qp_init_rd_atom = 1 << gen->log_max_ra_res_qp; + props->max_srq = 1 << gen->log_max_srq; + props->max_srq_wr = gen->max_srq_wqes - 1; + props->local_ca_ack_delay = gen->local_ca_ack_delay; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; - props->max_srq = 1 << dev->mdev->caps.log_max_srq; - props->max_srq_wr = dev->mdev->caps.max_srq_wqes - 1; props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = (unsigned int)-1; - props->local_ca_ack_delay = dev->mdev->caps.local_ca_ack_delay; + props->local_ca_ack_delay = gen->local_ca_ack_delay; props->atomic_cap = IB_ATOMIC_NONE; props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); - props->max_mcast_grp = 1 << dev->mdev->caps.log_max_mcg; - props->max_mcast_qp_attach = dev->mdev->caps.max_qp_mcg; + props->max_mcast_grp = 1 << gen->log_max_mcg; + props->max_mcast_qp_attach = gen->max_qp_mcg; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ @@ -254,10 +257,12 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; + struct mlx5_general_caps *gen; int ext_active_speed; int err = -ENOMEM; - if (port < 1 || port > dev->mdev->caps.num_ports) { + gen = &dev->mdev->caps.gen; + if (port < 1 || port > gen->num_ports) { mlx5_ib_warn(dev, "invalid port number %d\n", port); return -EINVAL; } @@ -288,8 +293,8 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, props->phys_state = out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20)); props->gid_tbl_len = out_mad->data[50]; - props->max_msg_sz = 1 << to_mdev(ibdev)->mdev->caps.log_max_msg; - props->pkey_tbl_len = to_mdev(ibdev)->mdev->caps.port[port - 1].pkey_table_len; + props->max_msg_sz = 1 << gen->log_max_msg; + props->pkey_tbl_len = gen->port[port - 1].pkey_table_len; props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46)); props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48)); props->active_width = out_mad->data[31] & 0xf; @@ -316,7 +321,7 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, /* If reported active speed is QDR, check if is FDR-10 */ if (props->active_speed == 4) { - if (dev->mdev->caps.ext_port_cap[port - 1] & + if (gen->ext_port_cap[port - 1] & MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { init_query_mad(in_mad); in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; @@ -470,6 +475,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct mlx5_ib_alloc_ucontext_req_v2 req; struct mlx5_ib_alloc_ucontext_resp resp; struct mlx5_ib_ucontext *context; + struct mlx5_general_caps *gen; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; int gross_uuars; @@ -480,6 +486,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, int i; size_t reqlen; + gen = &dev->mdev->caps.gen; if (!dev->ib_active) return ERR_PTR(-EAGAIN); @@ -512,14 +519,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; - resp.qp_tab_size = 1 << dev->mdev->caps.log_max_qp; - resp.bf_reg_size = dev->mdev->caps.bf_reg_size; + resp.qp_tab_size = 1 << gen->log_max_qp; + resp.bf_reg_size = gen->bf_reg_size; resp.cache_line_size = L1_CACHE_BYTES; - resp.max_sq_desc_sz = dev->mdev->caps.max_sq_desc_sz; - resp.max_rq_desc_sz = dev->mdev->caps.max_rq_desc_sz; - resp.max_send_wqebb = dev->mdev->caps.max_wqes; - resp.max_recv_wr = dev->mdev->caps.max_wqes; - resp.max_srq_recv_wr = dev->mdev->caps.max_srq_wqes; + resp.max_sq_desc_sz = gen->max_sq_desc_sz; + resp.max_rq_desc_sz = gen->max_rq_desc_sz; + resp.max_send_wqebb = gen->max_wqes; + resp.max_recv_wr = gen->max_wqes; + resp.max_srq_recv_wr = gen->max_srq_wqes; context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) @@ -565,7 +572,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, mutex_init(&context->db_page_mutex); resp.tot_uuars = req.total_num_uuars; - resp.num_ports = dev->mdev->caps.num_ports; + resp.num_ports = gen->num_ports; err = ib_copy_to_udata(udata, &resp, sizeof(resp) - sizeof(resp.reserved)); if (err) @@ -967,9 +974,11 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, static void get_ext_port_caps(struct mlx5_ib_dev *dev) { + struct mlx5_general_caps *gen; int port; - for (port = 1; port <= dev->mdev->caps.num_ports; port++) + gen = &dev->mdev->caps.gen; + for (port = 1; port <= gen->num_ports; port++) mlx5_query_ext_port_caps(dev, port); } @@ -977,9 +986,11 @@ static int get_port_caps(struct mlx5_ib_dev *dev) { struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; + struct mlx5_general_caps *gen; int err = 0; int port; + gen = &dev->mdev->caps.gen; pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) goto out; @@ -994,14 +1005,14 @@ static int get_port_caps(struct mlx5_ib_dev *dev) goto out; } - for (port = 1; port <= dev->mdev->caps.num_ports; port++) { + for (port = 1; port <= gen->num_ports; port++) { err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); if (err) { mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err); break; } - dev->mdev->caps.port[port - 1].pkey_table_len = dprops->max_pkeys; - dev->mdev->caps.port[port - 1].gid_table_len = pprops->gid_tbl_len; + gen->port[port - 1].pkey_table_len = dprops->max_pkeys; + gen->port[port - 1].gid_table_len = pprops->gid_tbl_len; mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", dprops->max_pkeys, pprops->gid_tbl_len); } @@ -1279,8 +1290,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; - dev->ib_dev.local_dma_lkey = mdev->caps.reserved_lkey; - dev->num_ports = mdev->caps.num_ports; + dev->ib_dev.local_dma_lkey = mdev->caps.gen.reserved_lkey; + dev->num_ports = mdev->caps.gen.num_ports; dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = dev->num_comp_vectors; dev->ib_dev.dma_device = &mdev->pdev->dev; @@ -1355,7 +1366,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; - if (mdev->caps.flags & MLX5_DEV_CAP_FLAG_XRC) { + if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; dev->ib_dev.uverbs_cmd_mask |= diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8c574b63d77b..dbfe498870c1 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -158,11 +158,13 @@ static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) { + struct mlx5_general_caps *gen; int wqe_size; int wq_size; + gen = &dev->mdev->caps.gen; /* Sanity check RQ size before proceeding */ - if (cap->max_recv_wr > dev->mdev->caps.max_wqes) + if (cap->max_recv_wr > gen->max_wqes) return -EINVAL; if (!has_rq) { @@ -182,10 +184,10 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); qp->rq.wqe_cnt = wq_size / wqe_size; - if (wqe_size > dev->mdev->caps.max_rq_desc_sz) { + if (wqe_size > gen->max_rq_desc_sz) { mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", wqe_size, - dev->mdev->caps.max_rq_desc_sz); + gen->max_rq_desc_sz); return -EINVAL; } qp->rq.wqe_shift = ilog2(wqe_size); @@ -266,9 +268,11 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr) static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, struct mlx5_ib_qp *qp) { + struct mlx5_general_caps *gen; int wqe_size; int wq_size; + gen = &dev->mdev->caps.gen; if (!attr->cap.max_send_wr) return 0; @@ -277,9 +281,9 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, if (wqe_size < 0) return wqe_size; - if (wqe_size > dev->mdev->caps.max_sq_desc_sz) { + if (wqe_size > gen->max_sq_desc_sz) { mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", - wqe_size, dev->mdev->caps.max_sq_desc_sz); + wqe_size, gen->max_sq_desc_sz); return -EINVAL; } @@ -292,9 +296,9 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; - if (qp->sq.wqe_cnt > dev->mdev->caps.max_wqes) { + if (qp->sq.wqe_cnt > gen->max_wqes) { mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n", - qp->sq.wqe_cnt, dev->mdev->caps.max_wqes); + qp->sq.wqe_cnt, gen->max_wqes); return -ENOMEM; } qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); @@ -309,11 +313,13 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) { + struct mlx5_general_caps *gen; int desc_sz = 1 << qp->sq.wqe_shift; - if (desc_sz > dev->mdev->caps.max_sq_desc_sz) { + gen = &dev->mdev->caps.gen; + if (desc_sz > gen->max_sq_desc_sz) { mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", - desc_sz, dev->mdev->caps.max_sq_desc_sz); + desc_sz, gen->max_sq_desc_sz); return -EINVAL; } @@ -325,9 +331,9 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev, qp->sq.wqe_cnt = ucmd->sq_wqe_count; - if (qp->sq.wqe_cnt > dev->mdev->caps.max_wqes) { + if (qp->sq.wqe_cnt > gen->max_wqes) { mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", - qp->sq.wqe_cnt, dev->mdev->caps.max_wqes); + qp->sq.wqe_cnt, gen->max_wqes); return -EINVAL; } @@ -803,16 +809,18 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_resources *devr = &dev->devr; struct mlx5_ib_create_qp_resp resp; struct mlx5_create_qp_mbox_in *in; + struct mlx5_general_caps *gen; struct mlx5_ib_create_qp ucmd; int inlen = sizeof(*in); int err; + gen = &dev->mdev->caps.gen; mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { - if (!(dev->mdev->caps.flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) { + if (!(gen->flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) { mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); return -EINVAL; } else { @@ -851,9 +859,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, mlx5_ib_dbg(dev, "invalid rq params\n"); return -EINVAL; } - if (ucmd.sq_wqe_count > dev->mdev->caps.max_wqes) { + if (ucmd.sq_wqe_count > gen->max_wqes) { mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", - ucmd.sq_wqe_count, dev->mdev->caps.max_wqes); + ucmd.sq_wqe_count, gen->max_wqes); return -EINVAL; } err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen); @@ -1144,6 +1152,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { + struct mlx5_general_caps *gen; struct mlx5_ib_dev *dev; struct mlx5_ib_qp *qp; u16 xrcdn = 0; @@ -1161,11 +1170,12 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, } dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); } + gen = &dev->mdev->caps.gen; switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: case IB_QPT_XRC_INI: - if (!(dev->mdev->caps.flags & MLX5_DEV_CAP_FLAG_XRC)) { + if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) { mlx5_ib_dbg(dev, "XRC not supported\n"); return ERR_PTR(-ENOSYS); } @@ -1272,6 +1282,9 @@ enum { static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) { + struct mlx5_general_caps *gen; + + gen = &dev->mdev->caps.gen; if (rate == IB_RATE_PORT_CURRENT) { return 0; } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) { @@ -1279,7 +1292,7 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) } else { while (rate != IB_RATE_2_5_GBPS && !(1 << (rate + MLX5_STAT_RATE_OFFSET) & - dev->mdev->caps.stat_rate_support)) + gen->stat_rate_support)) --rate; } @@ -1290,8 +1303,10 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, struct mlx5_qp_path *path, u8 port, int attr_mask, u32 path_flags, const struct ib_qp_attr *attr) { + struct mlx5_general_caps *gen; int err; + gen = &dev->mdev->caps.gen; path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0; @@ -1318,9 +1333,9 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, path->port = port; if (ah->ah_flags & IB_AH_GRH) { - if (ah->grh.sgid_index >= dev->mdev->caps.port[port - 1].gid_table_len) { + if (ah->grh.sgid_index >= gen->port[port - 1].gid_table_len) { pr_err(KERN_ERR "sgid_index (%u) too large. max is %d\n", - ah->grh.sgid_index, dev->mdev->caps.port[port - 1].gid_table_len); + ah->grh.sgid_index, gen->port[port - 1].gid_table_len); return -EINVAL; } @@ -1492,6 +1507,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_ib_cq *send_cq, *recv_cq; struct mlx5_qp_context *context; + struct mlx5_general_caps *gen; struct mlx5_modify_qp_mbox_in *in; struct mlx5_ib_pd *pd; enum mlx5_qp_state mlx5_cur, mlx5_new; @@ -1500,6 +1516,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, int mlx5_st; int err; + gen = &dev->mdev->caps.gen; in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) return -ENOMEM; @@ -1539,7 +1556,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = -EINVAL; goto out; } - context->mtu_msgmax = (attr->path_mtu << 5) | dev->mdev->caps.log_max_msg; + context->mtu_msgmax = (attr->path_mtu << 5) | gen->log_max_msg; } if (attr_mask & IB_QP_DEST_QPN) @@ -1685,9 +1702,11 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; + struct mlx5_general_caps *gen; int err = -EINVAL; int port; + gen = &dev->mdev->caps.gen; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; @@ -1699,21 +1718,21 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; if ((attr_mask & IB_QP_PORT) && - (attr->port_num == 0 || attr->port_num > dev->mdev->caps.num_ports)) + (attr->port_num == 0 || attr->port_num > gen->num_ports)) goto out; if (attr_mask & IB_QP_PKEY_INDEX) { port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - if (attr->pkey_index >= dev->mdev->caps.port[port - 1].pkey_table_len) + if (attr->pkey_index >= gen->port[port - 1].pkey_table_len) goto out; } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && - attr->max_rd_atomic > dev->mdev->caps.max_ra_res_qp) + attr->max_rd_atomic > (1 << gen->log_max_ra_res_qp)) goto out; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && - attr->max_dest_rd_atomic > dev->mdev->caps.max_ra_req_qp) + attr->max_dest_rd_atomic > (1 << gen->log_max_ra_req_qp)) goto out; if (cur_state == new_state && cur_state == IB_QPS_RESET) { @@ -2893,7 +2912,8 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at memset(ib_ah_attr, 0, sizeof(*ib_ah_attr)); ib_ah_attr->port_num = path->port; - if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) + if (ib_ah_attr->port_num == 0 || + ib_ah_attr->port_num > dev->caps.gen.num_ports) return; ib_ah_attr->sl = path->sl & 0xf; @@ -3011,10 +3031,12 @@ struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_general_caps *gen; struct mlx5_ib_xrcd *xrcd; int err; - if (!(dev->mdev->caps.flags & MLX5_DEV_CAP_FLAG_XRC)) + gen = &dev->mdev->caps.gen; + if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 70bd131ba646..97cc1baaa8e3 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -238,6 +238,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_general_caps *gen; struct mlx5_ib_srq *srq; int desc_size; int buf_size; @@ -247,11 +248,12 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, int is_xrc; u32 flgs, xrcdn; + gen = &dev->mdev->caps.gen; /* Sanity check SRQ size before proceeding */ - if (init_attr->attr.max_wr >= dev->mdev->caps.max_srq_wqes) { + if (init_attr->attr.max_wr >= gen->max_srq_wqes) { mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", init_attr->attr.max_wr, - dev->mdev->caps.max_srq_wqes); + gen->max_srq_wqes); return ERR_PTR(-EINVAL); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 65a7da69e2ac..6eb0f85cf872 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1538,16 +1538,9 @@ static const char *cmd_status_str(u8 status) } } -int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr) +static int cmd_status_to_err(u8 status) { - if (!hdr->status) - return 0; - - pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n", - cmd_status_str(hdr->status), hdr->status, - be32_to_cpu(hdr->syndrome)); - - switch (hdr->status) { + switch (status) { case MLX5_CMD_STAT_OK: return 0; case MLX5_CMD_STAT_INT_ERR: return -EIO; case MLX5_CMD_STAT_BAD_OP_ERR: return -EINVAL; @@ -1567,3 +1560,16 @@ int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr) default: return -EIO; } } + +/* this will be available till all the commands use set/get macros */ +int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr) +{ + if (!hdr->status) + return 0; + + pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n", + cmd_status_str(hdr->status), hdr->status, + be32_to_cpu(hdr->syndrome)); + + return cmd_status_to_err(hdr->status); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 4e8bd0b34bb0..11b9b840ad4d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -468,7 +468,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) err = mlx5_create_map_eq(dev, &table->pages_eq, MLX5_EQ_VEC_PAGES, - dev->caps.max_vf + 1, + dev->caps.gen.max_vf + 1, 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq", &dev->priv.uuari.uars[0]); if (err) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index f012658b6a92..087c4c797deb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -64,86 +64,9 @@ out_out: return err; } -int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, - struct mlx5_caps *caps) +int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps) { - struct mlx5_cmd_query_hca_cap_mbox_out *out; - struct mlx5_cmd_query_hca_cap_mbox_in in; - struct mlx5_query_special_ctxs_mbox_out ctx_out; - struct mlx5_query_special_ctxs_mbox_in ctx_in; - int err; - u16 t16; - - out = kzalloc(sizeof(*out), GFP_KERNEL); - if (!out) - return -ENOMEM; - - memset(&in, 0, sizeof(in)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP); - in.hdr.opmod = cpu_to_be16(0x1); - err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out)); - if (err) - goto out_out; - - if (out->hdr.status) { - err = mlx5_cmd_status_to_err(&out->hdr); - goto out_out; - } - - - caps->log_max_eq = out->hca_cap.log_max_eq & 0xf; - caps->max_cqes = 1 << out->hca_cap.log_max_cq_sz; - caps->max_wqes = 1 << out->hca_cap.log_max_qp_sz; - caps->max_sq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_sq); - caps->max_rq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_rq); - caps->flags = be64_to_cpu(out->hca_cap.flags); - caps->stat_rate_support = be16_to_cpu(out->hca_cap.stat_rate_support); - caps->log_max_msg = out->hca_cap.log_max_msg & 0x1f; - caps->num_ports = out->hca_cap.num_ports & 0xf; - caps->log_max_cq = out->hca_cap.log_max_cq & 0x1f; - if (caps->num_ports > MLX5_MAX_PORTS) { - mlx5_core_err(dev, "device has %d ports while the driver supports max %d ports\n", - caps->num_ports, MLX5_MAX_PORTS); - err = -EINVAL; - goto out_out; - } - caps->log_max_qp = out->hca_cap.log_max_qp & 0x1f; - caps->log_max_mkey = out->hca_cap.log_max_mkey & 0x3f; - caps->log_max_pd = out->hca_cap.log_max_pd & 0x1f; - caps->log_max_srq = out->hca_cap.log_max_srqs & 0x1f; - caps->local_ca_ack_delay = out->hca_cap.local_ca_ack_delay & 0x1f; - caps->log_max_mcg = out->hca_cap.log_max_mcg; - caps->max_qp_mcg = be32_to_cpu(out->hca_cap.max_qp_mcg) & 0xffffff; - caps->max_ra_res_qp = 1 << (out->hca_cap.log_max_ra_res_qp & 0x3f); - caps->max_ra_req_qp = 1 << (out->hca_cap.log_max_ra_req_qp & 0x3f); - caps->max_srq_wqes = 1 << out->hca_cap.log_max_srq_sz; - t16 = be16_to_cpu(out->hca_cap.bf_log_bf_reg_size); - if (t16 & 0x8000) { - caps->bf_reg_size = 1 << (t16 & 0x1f); - caps->bf_regs_per_page = MLX5_BF_REGS_PER_PAGE; - } else { - caps->bf_reg_size = 0; - caps->bf_regs_per_page = 0; - } - caps->min_page_sz = ~(u32)((1 << out->hca_cap.log_pg_sz) - 1); - - memset(&ctx_in, 0, sizeof(ctx_in)); - memset(&ctx_out, 0, sizeof(ctx_out)); - ctx_in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec(dev, &ctx_in, sizeof(ctx_in), - &ctx_out, sizeof(ctx_out)); - if (err) - goto out_out; - - if (ctx_out.hdr.status) - err = mlx5_cmd_status_to_err(&ctx_out.hdr); - - caps->reserved_lkey = be32_to_cpu(ctx_out.reserved_lkey); - -out_out: - kfree(out); - - return err; + return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR); } int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index f2716cc1f51d..d9f74618befa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -207,11 +207,11 @@ static void release_bar(struct pci_dev *pdev) static int mlx5_enable_msix(struct mlx5_core_dev *dev) { struct mlx5_eq_table *table = &dev->priv.eq_table; - int num_eqs = 1 << dev->caps.log_max_eq; + int num_eqs = 1 << dev->caps.gen.log_max_eq; int nvec; int i; - nvec = dev->caps.num_ports * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE; + nvec = dev->caps.gen.num_ports * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE; nvec = min_t(int, nvec, num_eqs); if (nvec <= MLX5_EQ_VEC_COMP_BASE) return -ENOMEM; @@ -250,13 +250,34 @@ struct mlx5_reg_host_endianess { #define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos)) enum { - MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) | - CAP_MASK(MLX5_CAP_OFF_DCT, 1), + MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) | + MLX5_DEV_CAP_FLAG_DCT, }; +static u16 to_fw_pkey_sz(u32 size) +{ + switch (size) { + case 128: + return 0; + case 256: + return 1; + case 512: + return 2; + case 1024: + return 3; + case 2048: + return 4; + case 4096: + return 5; + default: + pr_warn("invalid pkey table size %d\n", size); + return 0; + } +} + /* selectively copy writable fields clearing any reserved area */ -static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_hca_cap *from) +static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_general_caps *from) { u64 v64; @@ -265,76 +286,172 @@ static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_hca_cap *from) to->log_max_ra_res_dc = from->log_max_ra_res_dc & 0x3f; to->log_max_ra_req_qp = from->log_max_ra_req_qp & 0x3f; to->log_max_ra_res_qp = from->log_max_ra_res_qp & 0x3f; - to->log_max_atomic_size_qp = from->log_max_atomic_size_qp; - to->log_max_atomic_size_dc = from->log_max_atomic_size_dc; - v64 = be64_to_cpu(from->flags) & MLX5_CAP_BITS_RW_MASK; + to->pkey_table_size = cpu_to_be16(to_fw_pkey_sz(from->pkey_table_size)); + v64 = from->flags & MLX5_CAP_BITS_RW_MASK; to->flags = cpu_to_be64(v64); } -enum { - HCA_CAP_OPMOD_GET_MAX = 0, - HCA_CAP_OPMOD_GET_CUR = 1, -}; +static u16 get_pkey_table_size(int pkey) +{ + if (pkey > MLX5_MAX_LOG_PKEY_TABLE) + return 0; -static int handle_hca_cap(struct mlx5_core_dev *dev) + return MLX5_MIN_PKEY_TABLE_SIZE << pkey; +} + +static void fw2drv_caps(struct mlx5_caps *caps, + struct mlx5_cmd_query_hca_cap_mbox_out *out) { - struct mlx5_cmd_query_hca_cap_mbox_out *query_out = NULL; - struct mlx5_cmd_set_hca_cap_mbox_in *set_ctx = NULL; - struct mlx5_cmd_query_hca_cap_mbox_in query_ctx; - struct mlx5_cmd_set_hca_cap_mbox_out set_out; - u64 flags; + struct mlx5_general_caps *gen = &caps->gen; + u16 t16; + + gen->max_srq_wqes = 1 << out->hca_cap.log_max_srq_sz; + gen->max_wqes = 1 << out->hca_cap.log_max_qp_sz; + gen->log_max_qp = out->hca_cap.log_max_qp & 0x1f; + gen->log_max_strq = out->hca_cap.log_max_strq_sz; + gen->log_max_srq = out->hca_cap.log_max_srqs & 0x1f; + gen->max_cqes = 1 << out->hca_cap.log_max_cq_sz; + gen->log_max_cq = out->hca_cap.log_max_cq & 0x1f; + gen->max_eqes = out->hca_cap.log_max_eq_sz; + gen->log_max_mkey = out->hca_cap.log_max_mkey & 0x3f; + gen->log_max_eq = out->hca_cap.log_max_eq & 0xf; + gen->max_indirection = out->hca_cap.max_indirection; + gen->log_max_mrw_sz = out->hca_cap.log_max_mrw_sz; + gen->log_max_bsf_list_size = 0; + gen->log_max_klm_list_size = 0; + gen->log_max_ra_req_dc = out->hca_cap.log_max_ra_req_dc; + gen->log_max_ra_res_dc = out->hca_cap.log_max_ra_res_dc; + gen->log_max_ra_req_qp = out->hca_cap.log_max_ra_req_qp; + gen->log_max_ra_res_qp = out->hca_cap.log_max_ra_res_qp; + gen->max_qp_counters = be16_to_cpu(out->hca_cap.max_qp_count); + gen->pkey_table_size = get_pkey_table_size(be16_to_cpu(out->hca_cap.pkey_table_size)); + gen->local_ca_ack_delay = out->hca_cap.local_ca_ack_delay & 0x1f; + gen->num_ports = out->hca_cap.num_ports & 0xf; + gen->log_max_msg = out->hca_cap.log_max_msg & 0x1f; + gen->stat_rate_support = be16_to_cpu(out->hca_cap.stat_rate_support); + gen->flags = be64_to_cpu(out->hca_cap.flags); + pr_debug("flags = 0x%llx\n", gen->flags); + gen->uar_sz = out->hca_cap.uar_sz; + gen->min_log_pg_sz = out->hca_cap.log_pg_sz; + + t16 = be16_to_cpu(out->hca_cap.bf_log_bf_reg_size); + if (t16 & 0x8000) { + gen->bf_reg_size = 1 << (t16 & 0x1f); + gen->bf_regs_per_page = MLX5_BF_REGS_PER_PAGE; + } else { + gen->bf_reg_size = 0; + gen->bf_regs_per_page = 0; + } + gen->max_sq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_sq); + gen->max_rq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_rq); + gen->max_qp_mcg = be32_to_cpu(out->hca_cap.max_qp_mcg) & 0xffffff; + gen->log_max_pd = out->hca_cap.log_max_pd & 0x1f; + gen->log_max_xrcd = out->hca_cap.log_max_xrcd; + gen->log_uar_page_sz = be16_to_cpu(out->hca_cap.log_uar_page_sz); +} + +static const char *caps_opmod_str(u16 opmod) +{ + switch (opmod) { + case HCA_CAP_OPMOD_GET_MAX: + return "GET_MAX"; + case HCA_CAP_OPMOD_GET_CUR: + return "GET_CUR"; + default: + return "Invalid"; + } +} + +int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps, + u16 opmod) +{ + struct mlx5_cmd_query_hca_cap_mbox_out *out; + struct mlx5_cmd_query_hca_cap_mbox_in in; int err; - memset(&query_ctx, 0, sizeof(query_ctx)); - query_out = kzalloc(sizeof(*query_out), GFP_KERNEL); - if (!query_out) + memset(&in, 0, sizeof(in)); + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) return -ENOMEM; - set_ctx = kzalloc(sizeof(*set_ctx), GFP_KERNEL); - if (!set_ctx) { - err = -ENOMEM; + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP); + in.hdr.opmod = cpu_to_be16(opmod); + err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out)); + + err = mlx5_cmd_status_to_err(&out->hdr); + if (err) { + mlx5_core_warn(dev, "query max hca cap failed, %d\n", err); goto query_ex; } + mlx5_core_dbg(dev, "%s\n", caps_opmod_str(opmod)); + fw2drv_caps(caps, out); - query_ctx.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP); - query_ctx.hdr.opmod = cpu_to_be16(HCA_CAP_OPMOD_GET_CUR); - err = mlx5_cmd_exec(dev, &query_ctx, sizeof(query_ctx), - query_out, sizeof(*query_out)); +query_ex: + kfree(out); + return err; +} + +static int set_caps(struct mlx5_core_dev *dev, + struct mlx5_cmd_set_hca_cap_mbox_in *in) +{ + struct mlx5_cmd_set_hca_cap_mbox_out out; + int err; + + memset(&out, 0, sizeof(out)); + + in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_SET_HCA_CAP); + err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)); if (err) - goto query_ex; + return err; - err = mlx5_cmd_status_to_err(&query_out->hdr); - if (err) { - mlx5_core_warn(dev, "query hca cap failed, %d\n", err); + err = mlx5_cmd_status_to_err(&out.hdr); + + return err; +} + +static int handle_hca_cap(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_set_hca_cap_mbox_in *set_ctx = NULL; + struct mlx5_profile *prof = dev->profile; + struct mlx5_caps *cur_caps = NULL; + struct mlx5_caps *max_caps = NULL; + int err = -ENOMEM; + + set_ctx = kzalloc(sizeof(*set_ctx), GFP_KERNEL); + if (!set_ctx) goto query_ex; - } - copy_rw_fields(&set_ctx->hca_cap, &query_out->hca_cap); + max_caps = kzalloc(sizeof(*max_caps), GFP_KERNEL); + if (!max_caps) + goto query_ex; - if (dev->profile && dev->profile->mask & MLX5_PROF_MASK_QP_SIZE) - set_ctx->hca_cap.log_max_qp = dev->profile->log_max_qp; + cur_caps = kzalloc(sizeof(*cur_caps), GFP_KERNEL); + if (!cur_caps) + goto query_ex; - flags = be64_to_cpu(query_out->hca_cap.flags); - /* disable checksum */ - flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM; - - set_ctx->hca_cap.flags = cpu_to_be64(flags); - memset(&set_out, 0, sizeof(set_out)); - set_ctx->hca_cap.log_uar_page_sz = cpu_to_be16(PAGE_SHIFT - 12); - set_ctx->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_SET_HCA_CAP); - err = mlx5_cmd_exec(dev, set_ctx, sizeof(*set_ctx), - &set_out, sizeof(set_out)); - if (err) { - mlx5_core_warn(dev, "set hca cap failed, %d\n", err); + err = mlx5_core_get_caps(dev, max_caps, HCA_CAP_OPMOD_GET_MAX); + if (err) goto query_ex; - } - err = mlx5_cmd_status_to_err(&set_out.hdr); + err = mlx5_core_get_caps(dev, cur_caps, HCA_CAP_OPMOD_GET_CUR); if (err) goto query_ex; + /* we limit the size of the pkey table to 128 entries for now */ + cur_caps->gen.pkey_table_size = 128; + + if (prof->mask & MLX5_PROF_MASK_QP_SIZE) + cur_caps->gen.log_max_qp = prof->log_max_qp; + + /* disable checksum */ + cur_caps->gen.flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM; + + copy_rw_fields(&set_ctx->hca_cap, &cur_caps->gen); + err = set_caps(dev, set_ctx); + query_ex: - kfree(query_out); + kfree(cur_caps); + kfree(max_caps); kfree(set_ctx); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c index 68f5d9c77c7b..0a6348cefc01 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -174,11 +174,11 @@ int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari) for (i = 0; i < tot_uuars; i++) { bf = &uuari->bfs[i]; - bf->buf_size = dev->caps.bf_reg_size / 2; + bf->buf_size = dev->caps.gen.bf_reg_size / 2; bf->uar = &uuari->uars[i / MLX5_BF_REGS_PER_PAGE]; bf->regreg = uuari->uars[i / MLX5_BF_REGS_PER_PAGE].map; bf->reg = NULL; /* Add WC support */ - bf->offset = (i % MLX5_BF_REGS_PER_PAGE) * dev->caps.bf_reg_size + + bf->offset = (i % MLX5_BF_REGS_PER_PAGE) * dev->caps.gen.bf_reg_size + MLX5_BF_OFFSET; bf->need_lock = need_uuar_lock(i); spin_lock_init(&bf->lock); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 334947151dfc..dce01fd854a8 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -70,6 +70,11 @@ enum { MLX5_INLINE_SEG = 0x80000000, }; +enum { + MLX5_MIN_PKEY_TABLE_SIZE = 128, + MLX5_MAX_LOG_PKEY_TABLE = 5, +}; + enum { MLX5_PERM_LOCAL_READ = 1 << 2, MLX5_PERM_LOCAL_WRITE = 1 << 3, @@ -184,10 +189,10 @@ enum { MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, MLX5_DEV_CAP_FLAG_RESIZE_SRQ = 1LL << 32, + MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, MLX5_DEV_CAP_FLAG_REMOTE_FENCE = 1LL << 38, MLX5_DEV_CAP_FLAG_TLP_HINTS = 1LL << 39, MLX5_DEV_CAP_FLAG_SIG_HAND_OVER = 1LL << 40, - MLX5_DEV_CAP_FLAG_DCT = 1LL << 41, MLX5_DEV_CAP_FLAG_CMDIF_CSUM = 3LL << 46, }; @@ -243,10 +248,14 @@ enum { }; enum { - MLX5_CAP_OFF_DCT = 41, MLX5_CAP_OFF_CMDIF_CSUM = 46, }; +enum { + HCA_CAP_OPMOD_GET_MAX = 0, + HCA_CAP_OPMOD_GET_CUR = 1, +}; + struct mlx5_inbox_hdr { __be16 opcode; u8 rsvd[4]; @@ -303,9 +312,10 @@ struct mlx5_hca_cap { u8 log_max_ra_req_qp; u8 rsvd10; u8 log_max_ra_res_qp; - u8 rsvd11[4]; + u8 pad_cap; + u8 rsvd11[3]; __be16 max_qp_count; - __be16 rsvd12; + __be16 pkey_table_size; u8 rsvd13; u8 local_ca_ack_delay; u8 rsvd14; @@ -335,11 +345,7 @@ struct mlx5_hca_cap { u8 log_max_xrcd; u8 rsvd25[42]; __be16 log_uar_page_sz; - u8 rsvd26[28]; - u8 log_max_atomic_size_qp; - u8 rsvd27[2]; - u8 log_max_atomic_size_dc; - u8 rsvd28[76]; + u8 rsvd26[108]; }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b88e9b46d957..45a2add747e0 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -335,23 +335,30 @@ struct mlx5_port_caps { int pkey_table_len; }; -struct mlx5_caps { +struct mlx5_general_caps { u8 log_max_eq; u8 log_max_cq; u8 log_max_qp; u8 log_max_mkey; u8 log_max_pd; u8 log_max_srq; + u8 log_max_strq; + u8 log_max_mrw_sz; + u8 log_max_bsf_list_size; + u8 log_max_klm_list_size; u32 max_cqes; int max_wqes; + u32 max_eqes; + u32 max_indirection; int max_sq_desc_sz; int max_rq_desc_sz; + int max_dc_sq_desc_sz; u64 flags; u16 stat_rate_support; int log_max_msg; int num_ports; - int max_ra_res_qp; - int max_ra_req_qp; + u8 log_max_ra_res_qp; + u8 log_max_ra_req_qp; int max_srq_wqes; int bf_reg_size; int bf_regs_per_page; @@ -363,6 +370,19 @@ struct mlx5_caps { u8 log_max_mcg; u32 max_qp_mcg; int min_page_sz; + int pd_cap; + u32 max_qp_counters; + u32 pkey_table_size; + u8 log_max_ra_req_dc; + u8 log_max_ra_res_dc; + u32 uar_sz; + u8 min_log_pg_sz; + u8 log_max_xrcd; + u16 log_uar_page_sz; +}; + +struct mlx5_caps { + struct mlx5_general_caps gen; }; struct mlx5_cmd_mailbox { @@ -695,6 +715,8 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr); +int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps, + u16 opmod); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, -- cgit v1.2.3 From d29b796adada8780db3512c4a34b339f9aeef1ae Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 2 Oct 2014 12:19:43 +0300 Subject: net/mlx5_core: Use hardware registers description header file Add an auto generated header file that describes hardware registers along with set of macros that set/get values. The macros do static checks to avoid overflow, handle endianess, and overall provide a clean way to code commands. Currently the header file is small and we will add structs as we make use of the macros. A few commands were removed from the commands enum since they are not supported currently and will be added when support is available. Signed-off-by: Eli Cohen Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 36 ------- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 3 - include/linux/mlx5/device.h | 44 ++++++++ include/linux/mlx5/driver.h | 76 +------------- include/linux/mlx5/mlx5_ifc.h | 143 ++++++++++++++++++++++++++ 5 files changed, 188 insertions(+), 114 deletions(-) create mode 100644 include/linux/mlx5/mlx5_ifc.h (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 6eb0f85cf872..3ecef1310bae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -357,60 +357,24 @@ const char *mlx5_command_str(int command) case MLX5_CMD_OP_2ERR_QP: return "2ERR_QP"; - case MLX5_CMD_OP_RTS2SQD_QP: - return "RTS2SQD_QP"; - - case MLX5_CMD_OP_SQD2RTS_QP: - return "SQD2RTS_QP"; - case MLX5_CMD_OP_2RST_QP: return "2RST_QP"; case MLX5_CMD_OP_QUERY_QP: return "QUERY_QP"; - case MLX5_CMD_OP_CONF_SQP: - return "CONF_SQP"; - case MLX5_CMD_OP_MAD_IFC: return "MAD_IFC"; case MLX5_CMD_OP_INIT2INIT_QP: return "INIT2INIT_QP"; - case MLX5_CMD_OP_SUSPEND_QP: - return "SUSPEND_QP"; - - case MLX5_CMD_OP_UNSUSPEND_QP: - return "UNSUSPEND_QP"; - - case MLX5_CMD_OP_SQD2SQD_QP: - return "SQD2SQD_QP"; - - case MLX5_CMD_OP_ALLOC_QP_COUNTER_SET: - return "ALLOC_QP_COUNTER_SET"; - - case MLX5_CMD_OP_DEALLOC_QP_COUNTER_SET: - return "DEALLOC_QP_COUNTER_SET"; - - case MLX5_CMD_OP_QUERY_QP_COUNTER_SET: - return "QUERY_QP_COUNTER_SET"; - case MLX5_CMD_OP_CREATE_PSV: return "CREATE_PSV"; case MLX5_CMD_OP_DESTROY_PSV: return "DESTROY_PSV"; - case MLX5_CMD_OP_QUERY_PSV: - return "QUERY_PSV"; - - case MLX5_CMD_OP_QUERY_SIG_RULE_TABLE: - return "QUERY_SIG_RULE_TABLE"; - - case MLX5_CMD_OP_QUERY_BLOCK_SIZE_TABLE: - return "QUERY_BLOCK_SIZE_TABLE"; - case MLX5_CMD_OP_CREATE_SRQ: return "CREATE_SRQ"; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 8145b4668229..415b67ce379e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -184,13 +184,10 @@ int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state, [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, - [MLX5_QP_STATE_SQD] = MLX5_CMD_OP_RTS2SQD_QP, }, [MLX5_QP_STATE_SQD] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQD2RTS_QP, - [MLX5_QP_STATE_SQD] = MLX5_CMD_OP_SQD2SQD_QP, }, [MLX5_QP_STATE_SQER] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index dce01fd854a8..0032687f58c7 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -44,6 +44,50 @@ #error Host endianness not defined #endif +/* helper macros */ +#define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)0) +#define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld) +#define __mlx5_bit_off(typ, fld) ((unsigned)(unsigned long)(&(__mlx5_nullp(typ)->fld))) +#define __mlx5_dw_off(typ, fld) (__mlx5_bit_off(typ, fld) / 32) +#define __mlx5_64_off(typ, fld) (__mlx5_bit_off(typ, fld) / 64) +#define __mlx5_dw_bit_off(typ, fld) (32 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0x1f)) +#define __mlx5_mask(typ, fld) ((u32)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) +#define __mlx5_dw_mask(typ, fld) (__mlx5_mask(typ, fld) << __mlx5_dw_bit_off(typ, fld)) +#define __mlx5_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits) + +#define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8) +#define MLX5_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8) +#define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32) +#define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8) +#define MLX5_ADDR_OF(typ, p, fld) ((char *)(p) + MLX5_BYTE_OFF(typ, fld)) + +/* insert a value to a struct */ +#define MLX5_SET(typ, p, fld, v) do { \ + BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32); \ + *((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \ + cpu_to_be32((be32_to_cpu(*((__be32 *)(p) + __mlx5_dw_off(typ, fld))) & \ + (~__mlx5_dw_mask(typ, fld))) | (((v) & __mlx5_mask(typ, fld)) \ + << __mlx5_dw_bit_off(typ, fld))); \ +} while (0) + +#define MLX5_GET(typ, p, fld) ((be32_to_cpu(*((__be32 *)(p) +\ +__mlx5_dw_off(typ, fld))) >> __mlx5_dw_bit_off(typ, fld)) & \ +__mlx5_mask(typ, fld)) + +#define MLX5_GET_PR(typ, p, fld) ({ \ + u32 ___t = MLX5_GET(typ, p, fld); \ + pr_debug(#fld " = 0x%x\n", ___t); \ + ___t; \ +}) + +#define MLX5_SET64(typ, p, fld, v) do { \ + BUILD_BUG_ON(__mlx5_bit_sz(typ, fld) != 64); \ + BUILD_BUG_ON(__mlx5_bit_off(typ, fld) % 64); \ + *((__be64 *)(p) + __mlx5_64_off(typ, fld)) = cpu_to_be64(v); \ +} while (0) + +#define MLX5_GET64(typ, p, fld) be64_to_cpu(*((__be64 *)(p) + __mlx5_64_off(typ, fld))) + enum { MLX5_MAX_COMMANDS = 32, MLX5_CMD_DATA_BLOCK_SIZE = 512, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 45a2add747e0..6f48dc793b9f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -44,6 +44,7 @@ #include #include +#include enum { MLX5_BOARD_ID_LEN = 64, @@ -98,81 +99,6 @@ enum { MLX5_ATOMIC_MODE_256B = 8 << 16, }; -enum { - MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, - MLX5_CMD_OP_QUERY_ADAPTER = 0x101, - MLX5_CMD_OP_INIT_HCA = 0x102, - MLX5_CMD_OP_TEARDOWN_HCA = 0x103, - MLX5_CMD_OP_ENABLE_HCA = 0x104, - MLX5_CMD_OP_DISABLE_HCA = 0x105, - MLX5_CMD_OP_QUERY_PAGES = 0x107, - MLX5_CMD_OP_MANAGE_PAGES = 0x108, - MLX5_CMD_OP_SET_HCA_CAP = 0x109, - - MLX5_CMD_OP_CREATE_MKEY = 0x200, - MLX5_CMD_OP_QUERY_MKEY = 0x201, - MLX5_CMD_OP_DESTROY_MKEY = 0x202, - MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS = 0x203, - - MLX5_CMD_OP_CREATE_EQ = 0x301, - MLX5_CMD_OP_DESTROY_EQ = 0x302, - MLX5_CMD_OP_QUERY_EQ = 0x303, - - MLX5_CMD_OP_CREATE_CQ = 0x400, - MLX5_CMD_OP_DESTROY_CQ = 0x401, - MLX5_CMD_OP_QUERY_CQ = 0x402, - MLX5_CMD_OP_MODIFY_CQ = 0x403, - - MLX5_CMD_OP_CREATE_QP = 0x500, - MLX5_CMD_OP_DESTROY_QP = 0x501, - MLX5_CMD_OP_RST2INIT_QP = 0x502, - MLX5_CMD_OP_INIT2RTR_QP = 0x503, - MLX5_CMD_OP_RTR2RTS_QP = 0x504, - MLX5_CMD_OP_RTS2RTS_QP = 0x505, - MLX5_CMD_OP_SQERR2RTS_QP = 0x506, - MLX5_CMD_OP_2ERR_QP = 0x507, - MLX5_CMD_OP_RTS2SQD_QP = 0x508, - MLX5_CMD_OP_SQD2RTS_QP = 0x509, - MLX5_CMD_OP_2RST_QP = 0x50a, - MLX5_CMD_OP_QUERY_QP = 0x50b, - MLX5_CMD_OP_CONF_SQP = 0x50c, - MLX5_CMD_OP_MAD_IFC = 0x50d, - MLX5_CMD_OP_INIT2INIT_QP = 0x50e, - MLX5_CMD_OP_SUSPEND_QP = 0x50f, - MLX5_CMD_OP_UNSUSPEND_QP = 0x510, - MLX5_CMD_OP_SQD2SQD_QP = 0x511, - MLX5_CMD_OP_ALLOC_QP_COUNTER_SET = 0x512, - MLX5_CMD_OP_DEALLOC_QP_COUNTER_SET = 0x513, - MLX5_CMD_OP_QUERY_QP_COUNTER_SET = 0x514, - - MLX5_CMD_OP_CREATE_PSV = 0x600, - MLX5_CMD_OP_DESTROY_PSV = 0x601, - MLX5_CMD_OP_QUERY_PSV = 0x602, - MLX5_CMD_OP_QUERY_SIG_RULE_TABLE = 0x603, - MLX5_CMD_OP_QUERY_BLOCK_SIZE_TABLE = 0x604, - - MLX5_CMD_OP_CREATE_SRQ = 0x700, - MLX5_CMD_OP_DESTROY_SRQ = 0x701, - MLX5_CMD_OP_QUERY_SRQ = 0x702, - MLX5_CMD_OP_ARM_RQ = 0x703, - MLX5_CMD_OP_RESIZE_SRQ = 0x704, - - MLX5_CMD_OP_ALLOC_PD = 0x800, - MLX5_CMD_OP_DEALLOC_PD = 0x801, - MLX5_CMD_OP_ALLOC_UAR = 0x802, - MLX5_CMD_OP_DEALLOC_UAR = 0x803, - - MLX5_CMD_OP_ATTACH_TO_MCG = 0x806, - MLX5_CMD_OP_DETACH_FROM_MCG = 0x807, - - - MLX5_CMD_OP_ALLOC_XRCD = 0x80e, - MLX5_CMD_OP_DEALLOC_XRCD = 0x80f, - - MLX5_CMD_OP_ACCESS_REG = 0x805, - MLX5_CMD_OP_MAX = 0x810, -}; - enum { MLX5_REG_PCAP = 0x5001, MLX5_REG_PMTU = 0x5003, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h new file mode 100644 index 000000000000..df3bd9b5fbcf --- /dev/null +++ b/include/linux/mlx5/mlx5_ifc.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2014, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IFC_H +#define MLX5_IFC_H + +enum { + MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, + MLX5_CMD_OP_QUERY_ADAPTER = 0x101, + MLX5_CMD_OP_INIT_HCA = 0x102, + MLX5_CMD_OP_TEARDOWN_HCA = 0x103, + MLX5_CMD_OP_ENABLE_HCA = 0x104, + MLX5_CMD_OP_DISABLE_HCA = 0x105, + MLX5_CMD_OP_QUERY_PAGES = 0x107, + MLX5_CMD_OP_MANAGE_PAGES = 0x108, + MLX5_CMD_OP_SET_HCA_CAP = 0x109, + MLX5_CMD_OP_CREATE_MKEY = 0x200, + MLX5_CMD_OP_QUERY_MKEY = 0x201, + MLX5_CMD_OP_DESTROY_MKEY = 0x202, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS = 0x203, + MLX5_CMD_OP_PAGE_FAULT_RESUME = 0x204, + MLX5_CMD_OP_CREATE_EQ = 0x301, + MLX5_CMD_OP_DESTROY_EQ = 0x302, + MLX5_CMD_OP_QUERY_EQ = 0x303, + MLX5_CMD_OP_GEN_EQE = 0x304, + MLX5_CMD_OP_CREATE_CQ = 0x400, + MLX5_CMD_OP_DESTROY_CQ = 0x401, + MLX5_CMD_OP_QUERY_CQ = 0x402, + MLX5_CMD_OP_MODIFY_CQ = 0x403, + MLX5_CMD_OP_CREATE_QP = 0x500, + MLX5_CMD_OP_DESTROY_QP = 0x501, + MLX5_CMD_OP_RST2INIT_QP = 0x502, + MLX5_CMD_OP_INIT2RTR_QP = 0x503, + MLX5_CMD_OP_RTR2RTS_QP = 0x504, + MLX5_CMD_OP_RTS2RTS_QP = 0x505, + MLX5_CMD_OP_SQERR2RTS_QP = 0x506, + MLX5_CMD_OP_2ERR_QP = 0x507, + MLX5_CMD_OP_2RST_QP = 0x50a, + MLX5_CMD_OP_QUERY_QP = 0x50b, + MLX5_CMD_OP_INIT2INIT_QP = 0x50e, + MLX5_CMD_OP_CREATE_PSV = 0x600, + MLX5_CMD_OP_DESTROY_PSV = 0x601, + MLX5_CMD_OP_CREATE_SRQ = 0x700, + MLX5_CMD_OP_DESTROY_SRQ = 0x701, + MLX5_CMD_OP_QUERY_SRQ = 0x702, + MLX5_CMD_OP_ARM_RQ = 0x703, + MLX5_CMD_OP_RESIZE_SRQ = 0x704, + MLX5_CMD_OP_CREATE_DCT = 0x710, + MLX5_CMD_OP_DESTROY_DCT = 0x711, + MLX5_CMD_OP_DRAIN_DCT = 0x712, + MLX5_CMD_OP_QUERY_DCT = 0x713, + MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION = 0x714, + MLX5_CMD_OP_QUERY_VPORT_STATE = 0x750, + MLX5_CMD_OP_MODIFY_VPORT_STATE = 0x751, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT = 0x752, + MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT = 0x753, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT = 0x755, + MLX5_CMD_OP_QUERY_RCOE_ADDRESS = 0x760, + MLX5_CMD_OP_SET_ROCE_ADDRESS = 0x761, + MLX5_CMD_OP_QUERY_VPORT_COUNTER = 0x770, + MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, + MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, + MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773, + MLX5_CMD_OP_ALLOC_PD = 0x800, + MLX5_CMD_OP_DEALLOC_PD = 0x801, + MLX5_CMD_OP_ALLOC_UAR = 0x802, + MLX5_CMD_OP_DEALLOC_UAR = 0x803, + MLX5_CMD_OP_CONFIG_INT_MODERATION = 0x804, + MLX5_CMD_OP_ACCESS_REG = 0x805, + MLX5_CMD_OP_ATTACH_TO_MCG = 0x806, + MLX5_CMD_OP_DETACH_FROM_MCG = 0x807, + MLX5_CMD_OP_GET_DROPPED_PACKET_LOG = 0x80a, + MLX5_CMD_OP_MAD_IFC = 0x50d, + MLX5_CMD_OP_QUERY_MAD_DEMUX = 0x80b, + MLX5_CMD_OP_SET_MAD_DEMUX = 0x80c, + MLX5_CMD_OP_NOP = 0x80d, + MLX5_CMD_OP_ALLOC_XRCD = 0x80e, + MLX5_CMD_OP_DEALLOC_XRCD = 0x80f, + MLX5_CMD_OP_SET_BURST_SIZE = 0x812, + MLX5_CMD_OP_QUERY_BURST_SZIE = 0x813, + MLX5_CMD_OP_ACTIVATE_TRACER = 0x814, + MLX5_CMD_OP_DEACTIVATE_TRACER = 0x815, + MLX5_CMD_OP_CREATE_SNIFFER_RULE = 0x820, + MLX5_CMD_OP_DESTROY_SNIFFER_RULE = 0x821, + MLX5_CMD_OP_QUERY_CONG_PARAMS = 0x822, + MLX5_CMD_OP_MODIFY_CONG_PARAMS = 0x823, + MLX5_CMD_OP_QUERY_CONG_STATISTICS = 0x824, + MLX5_CMD_OP_CREATE_TIR = 0x900, + MLX5_CMD_OP_MODIFY_TIR = 0x901, + MLX5_CMD_OP_DESTROY_TIR = 0x902, + MLX5_CMD_OP_QUERY_TIR = 0x903, + MLX5_CMD_OP_CREATE_TIS = 0x912, + MLX5_CMD_OP_MODIFY_TIS = 0x913, + MLX5_CMD_OP_DESTROY_TIS = 0x914, + MLX5_CMD_OP_QUERY_TIS = 0x915, + MLX5_CMD_OP_CREATE_SQ = 0x904, + MLX5_CMD_OP_MODIFY_SQ = 0x905, + MLX5_CMD_OP_DESTROY_SQ = 0x906, + MLX5_CMD_OP_QUERY_SQ = 0x907, + MLX5_CMD_OP_CREATE_RQ = 0x908, + MLX5_CMD_OP_MODIFY_RQ = 0x909, + MLX5_CMD_OP_DESTROY_RQ = 0x90a, + MLX5_CMD_OP_QUERY_RQ = 0x90b, + MLX5_CMD_OP_CREATE_RMP = 0x90c, + MLX5_CMD_OP_MODIFY_RMP = 0x90d, + MLX5_CMD_OP_DESTROY_RMP = 0x90e, + MLX5_CMD_OP_QUERY_RMP = 0x90f, + MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY = 0x910, + MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY = 0x911, + MLX5_CMD_OP_MAX = 0x911 +}; + +#endif /* MLX5_IFC_H */ -- cgit v1.2.3 From b775516b042f9e35f856bd2914afefd9d23021d7 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 2 Oct 2014 12:19:44 +0300 Subject: net/mlx5_core: use set/get macros in device caps Transform device capabilities related commands to use set/get macros to manipulate command mailboxes. Signed-off-by: Eli Cohen Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 17 ++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 150 +++++++++--------- include/linux/mlx5/device.h | 92 ----------- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 206 +++++++++++++++++++++++++ 5 files changed, 298 insertions(+), 168 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 3ecef1310bae..368c6c5ea014 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1537,3 +1537,20 @@ int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr) return cmd_status_to_err(hdr->status); } + +int mlx5_cmd_status_to_err_v2(void *ptr) +{ + u32 syndrome; + u8 status; + + status = be32_to_cpu(*(__be32 *)ptr) >> 24; + if (!status) + return 0; + + syndrome = be32_to_cpu(*(__be32 *)(ptr + 4)); + + pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n", + cmd_status_str(status), status, syndrome); + + return cmd_status_to_err(status); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index d9f74618befa..b9e3259e415f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "mlx5_core.h" #define DRIVER_NAME "mlx5_core" @@ -277,18 +278,20 @@ static u16 to_fw_pkey_sz(u32 size) /* selectively copy writable fields clearing any reserved area */ -static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_general_caps *from) +static void copy_rw_fields(void *to, struct mlx5_caps *from) { + __be64 *flags_off = (__be64 *)MLX5_ADDR_OF(cmd_hca_cap, to, reserved_22); u64 v64; - to->log_max_qp = from->log_max_qp & 0x1f; - to->log_max_ra_req_dc = from->log_max_ra_req_dc & 0x3f; - to->log_max_ra_res_dc = from->log_max_ra_res_dc & 0x3f; - to->log_max_ra_req_qp = from->log_max_ra_req_qp & 0x3f; - to->log_max_ra_res_qp = from->log_max_ra_res_qp & 0x3f; - to->pkey_table_size = cpu_to_be16(to_fw_pkey_sz(from->pkey_table_size)); - v64 = from->flags & MLX5_CAP_BITS_RW_MASK; - to->flags = cpu_to_be64(v64); + MLX5_SET(cmd_hca_cap, to, log_max_qp, from->gen.log_max_qp); + MLX5_SET(cmd_hca_cap, to, log_max_ra_req_qp, from->gen.log_max_ra_req_qp); + MLX5_SET(cmd_hca_cap, to, log_max_ra_res_qp, from->gen.log_max_ra_res_qp); + MLX5_SET(cmd_hca_cap, to, pkey_table_size, from->gen.pkey_table_size); + MLX5_SET(cmd_hca_cap, to, log_max_ra_req_dc, from->gen.log_max_ra_req_dc); + MLX5_SET(cmd_hca_cap, to, log_max_ra_res_dc, from->gen.log_max_ra_res_dc); + MLX5_SET(cmd_hca_cap, to, pkey_table_size, to_fw_pkey_sz(from->gen.pkey_table_size)); + v64 = from->gen.flags & MLX5_CAP_BITS_RW_MASK; + *flags_off = cpu_to_be64(v64); } static u16 get_pkey_table_size(int pkey) @@ -299,55 +302,47 @@ static u16 get_pkey_table_size(int pkey) return MLX5_MIN_PKEY_TABLE_SIZE << pkey; } -static void fw2drv_caps(struct mlx5_caps *caps, - struct mlx5_cmd_query_hca_cap_mbox_out *out) +static void fw2drv_caps(struct mlx5_caps *caps, void *out) { struct mlx5_general_caps *gen = &caps->gen; - u16 t16; - - gen->max_srq_wqes = 1 << out->hca_cap.log_max_srq_sz; - gen->max_wqes = 1 << out->hca_cap.log_max_qp_sz; - gen->log_max_qp = out->hca_cap.log_max_qp & 0x1f; - gen->log_max_strq = out->hca_cap.log_max_strq_sz; - gen->log_max_srq = out->hca_cap.log_max_srqs & 0x1f; - gen->max_cqes = 1 << out->hca_cap.log_max_cq_sz; - gen->log_max_cq = out->hca_cap.log_max_cq & 0x1f; - gen->max_eqes = out->hca_cap.log_max_eq_sz; - gen->log_max_mkey = out->hca_cap.log_max_mkey & 0x3f; - gen->log_max_eq = out->hca_cap.log_max_eq & 0xf; - gen->max_indirection = out->hca_cap.max_indirection; - gen->log_max_mrw_sz = out->hca_cap.log_max_mrw_sz; - gen->log_max_bsf_list_size = 0; - gen->log_max_klm_list_size = 0; - gen->log_max_ra_req_dc = out->hca_cap.log_max_ra_req_dc; - gen->log_max_ra_res_dc = out->hca_cap.log_max_ra_res_dc; - gen->log_max_ra_req_qp = out->hca_cap.log_max_ra_req_qp; - gen->log_max_ra_res_qp = out->hca_cap.log_max_ra_res_qp; - gen->max_qp_counters = be16_to_cpu(out->hca_cap.max_qp_count); - gen->pkey_table_size = get_pkey_table_size(be16_to_cpu(out->hca_cap.pkey_table_size)); - gen->local_ca_ack_delay = out->hca_cap.local_ca_ack_delay & 0x1f; - gen->num_ports = out->hca_cap.num_ports & 0xf; - gen->log_max_msg = out->hca_cap.log_max_msg & 0x1f; - gen->stat_rate_support = be16_to_cpu(out->hca_cap.stat_rate_support); - gen->flags = be64_to_cpu(out->hca_cap.flags); - pr_debug("flags = 0x%llx\n", gen->flags); - gen->uar_sz = out->hca_cap.uar_sz; - gen->min_log_pg_sz = out->hca_cap.log_pg_sz; - t16 = be16_to_cpu(out->hca_cap.bf_log_bf_reg_size); - if (t16 & 0x8000) { - gen->bf_reg_size = 1 << (t16 & 0x1f); - gen->bf_regs_per_page = MLX5_BF_REGS_PER_PAGE; - } else { - gen->bf_reg_size = 0; - gen->bf_regs_per_page = 0; - } - gen->max_sq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_sq); - gen->max_rq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_rq); - gen->max_qp_mcg = be32_to_cpu(out->hca_cap.max_qp_mcg) & 0xffffff; - gen->log_max_pd = out->hca_cap.log_max_pd & 0x1f; - gen->log_max_xrcd = out->hca_cap.log_max_xrcd; - gen->log_uar_page_sz = be16_to_cpu(out->hca_cap.log_uar_page_sz); + gen->max_srq_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_srq_sz); + gen->max_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_qp_sz); + gen->log_max_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_qp); + gen->log_max_strq = MLX5_GET_PR(cmd_hca_cap, out, log_max_strq_sz); + gen->log_max_srq = MLX5_GET_PR(cmd_hca_cap, out, log_max_srqs); + gen->max_cqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_cq_sz); + gen->log_max_cq = MLX5_GET_PR(cmd_hca_cap, out, log_max_cq); + gen->max_eqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_eq_sz); + gen->log_max_mkey = MLX5_GET_PR(cmd_hca_cap, out, log_max_mkey); + gen->log_max_eq = MLX5_GET_PR(cmd_hca_cap, out, log_max_eq); + gen->max_indirection = MLX5_GET_PR(cmd_hca_cap, out, max_indirection); + gen->log_max_mrw_sz = MLX5_GET_PR(cmd_hca_cap, out, log_max_mrw_sz); + gen->log_max_bsf_list_size = MLX5_GET_PR(cmd_hca_cap, out, log_max_bsf_list_size); + gen->log_max_klm_list_size = MLX5_GET_PR(cmd_hca_cap, out, log_max_klm_list_size); + gen->log_max_ra_req_dc = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_req_dc); + gen->log_max_ra_res_dc = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_res_dc); + gen->log_max_ra_req_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_req_qp); + gen->log_max_ra_res_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_res_qp); + gen->max_qp_counters = MLX5_GET_PR(cmd_hca_cap, out, max_qp_cnt); + gen->pkey_table_size = get_pkey_table_size(MLX5_GET_PR(cmd_hca_cap, out, pkey_table_size)); + gen->local_ca_ack_delay = MLX5_GET_PR(cmd_hca_cap, out, local_ca_ack_delay); + gen->num_ports = MLX5_GET_PR(cmd_hca_cap, out, num_ports); + gen->log_max_msg = MLX5_GET_PR(cmd_hca_cap, out, log_max_msg); + gen->stat_rate_support = MLX5_GET_PR(cmd_hca_cap, out, stat_rate_support); + gen->flags = be64_to_cpu(*(__be64 *)MLX5_ADDR_OF(cmd_hca_cap, out, reserved_22)); + pr_debug("flags = 0x%llx\n", gen->flags); + gen->uar_sz = MLX5_GET_PR(cmd_hca_cap, out, uar_sz); + gen->min_log_pg_sz = MLX5_GET_PR(cmd_hca_cap, out, log_pg_sz); + gen->bf_reg_size = MLX5_GET_PR(cmd_hca_cap, out, bf); + gen->bf_reg_size = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_bf_reg_size); + gen->max_sq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_sq); + gen->max_rq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_rq); + gen->max_dc_sq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_sq_dc); + gen->max_qp_mcg = MLX5_GET_PR(cmd_hca_cap, out, max_qp_mcg); + gen->log_max_pd = MLX5_GET_PR(cmd_hca_cap, out, log_max_pd); + gen->log_max_xrcd = MLX5_GET_PR(cmd_hca_cap, out, log_max_xrcd); + gen->log_uar_page_sz = MLX5_GET_PR(cmd_hca_cap, out, log_uar_page_sz); } static const char *caps_opmod_str(u16 opmod) @@ -365,59 +360,61 @@ static const char *caps_opmod_str(u16 opmod) int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps, u16 opmod) { - struct mlx5_cmd_query_hca_cap_mbox_out *out; - struct mlx5_cmd_query_hca_cap_mbox_in in; + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)]; + int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *out; int err; - memset(&in, 0, sizeof(in)); - out = kzalloc(sizeof(*out), GFP_KERNEL); + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, GFP_KERNEL); if (!out) return -ENOMEM; + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto query_ex; - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP); - in.hdr.opmod = cpu_to_be16(opmod); - err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out)); - - err = mlx5_cmd_status_to_err(&out->hdr); + err = mlx5_cmd_status_to_err_v2(out); if (err) { mlx5_core_warn(dev, "query max hca cap failed, %d\n", err); goto query_ex; } mlx5_core_dbg(dev, "%s\n", caps_opmod_str(opmod)); - fw2drv_caps(caps, out); + fw2drv_caps(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct)); query_ex: kfree(out); return err; } -static int set_caps(struct mlx5_core_dev *dev, - struct mlx5_cmd_set_hca_cap_mbox_in *in) +static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz) { - struct mlx5_cmd_set_hca_cap_mbox_out out; + u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)]; int err; - memset(&out, 0, sizeof(out)); + memset(out, 0, sizeof(out)); - in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_SET_HCA_CAP); - err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)); + MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP); + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); if (err) return err; - err = mlx5_cmd_status_to_err(&out.hdr); + err = mlx5_cmd_status_to_err_v2(out); return err; } static int handle_hca_cap(struct mlx5_core_dev *dev) { - struct mlx5_cmd_set_hca_cap_mbox_in *set_ctx = NULL; + void *set_ctx = NULL; struct mlx5_profile *prof = dev->profile; struct mlx5_caps *cur_caps = NULL; struct mlx5_caps *max_caps = NULL; int err = -ENOMEM; + int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); - set_ctx = kzalloc(sizeof(*set_ctx), GFP_KERNEL); + set_ctx = kzalloc(set_sz, GFP_KERNEL); if (!set_ctx) goto query_ex; @@ -446,8 +443,9 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) /* disable checksum */ cur_caps->gen.flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM; - copy_rw_fields(&set_ctx->hca_cap, &cur_caps->gen); - err = set_caps(dev, set_ctx); + copy_rw_fields(MLX5_ADDR_OF(set_hca_cap_in, set_ctx, hca_capability_struct), + cur_caps); + err = set_caps(dev, set_ctx, set_sz); query_ex: kfree(cur_caps); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0032687f58c7..1d67fd32e71c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -327,98 +327,6 @@ struct mlx5_cmd_query_adapter_mbox_out { u8 vsd_psid[16]; }; -struct mlx5_hca_cap { - u8 rsvd1[16]; - u8 log_max_srq_sz; - u8 log_max_qp_sz; - u8 rsvd2; - u8 log_max_qp; - u8 log_max_strq_sz; - u8 log_max_srqs; - u8 rsvd4[2]; - u8 rsvd5; - u8 log_max_cq_sz; - u8 rsvd6; - u8 log_max_cq; - u8 log_max_eq_sz; - u8 log_max_mkey; - u8 rsvd7; - u8 log_max_eq; - u8 max_indirection; - u8 log_max_mrw_sz; - u8 log_max_bsf_list_sz; - u8 log_max_klm_list_sz; - u8 rsvd_8_0; - u8 log_max_ra_req_dc; - u8 rsvd_8_1; - u8 log_max_ra_res_dc; - u8 rsvd9; - u8 log_max_ra_req_qp; - u8 rsvd10; - u8 log_max_ra_res_qp; - u8 pad_cap; - u8 rsvd11[3]; - __be16 max_qp_count; - __be16 pkey_table_size; - u8 rsvd13; - u8 local_ca_ack_delay; - u8 rsvd14; - u8 num_ports; - u8 log_max_msg; - u8 rsvd15[3]; - __be16 stat_rate_support; - u8 rsvd16[2]; - __be64 flags; - u8 rsvd17; - u8 uar_sz; - u8 rsvd18; - u8 log_pg_sz; - __be16 bf_log_bf_reg_size; - u8 rsvd19[4]; - __be16 max_desc_sz_sq; - u8 rsvd20[2]; - __be16 max_desc_sz_rq; - u8 rsvd21[2]; - __be16 max_desc_sz_sq_dc; - __be32 max_qp_mcg; - u8 rsvd22[3]; - u8 log_max_mcg; - u8 rsvd23; - u8 log_max_pd; - u8 rsvd24; - u8 log_max_xrcd; - u8 rsvd25[42]; - __be16 log_uar_page_sz; - u8 rsvd26[108]; -}; - - -struct mlx5_cmd_query_hca_cap_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - - -struct mlx5_cmd_query_hca_cap_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; - struct mlx5_hca_cap hca_cap; -}; - - -struct mlx5_cmd_set_hca_cap_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; - struct mlx5_hca_cap hca_cap; -}; - - -struct mlx5_cmd_set_hca_cap_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; -}; - - struct mlx5_cmd_init_hca_mbox_in { struct mlx5_inbox_hdr hdr; u8 rsvd0[2]; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6f48dc793b9f..c439f9c59b93 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -641,6 +641,7 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr); +int mlx5_cmd_status_to_err_v2(void *ptr); int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps, u16 opmod); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index df3bd9b5fbcf..5f48b8f592c5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -140,4 +140,210 @@ enum { MLX5_CMD_OP_MAX = 0x911 }; +struct mlx5_ifc_cmd_hca_cap_bits { + u8 reserved_0[0x80]; + + u8 log_max_srq_sz[0x8]; + u8 log_max_qp_sz[0x8]; + u8 reserved_1[0xb]; + u8 log_max_qp[0x5]; + + u8 log_max_strq_sz[0x8]; + u8 reserved_2[0x3]; + u8 log_max_srqs[0x5]; + u8 reserved_3[0x10]; + + u8 reserved_4[0x8]; + u8 log_max_cq_sz[0x8]; + u8 reserved_5[0xb]; + u8 log_max_cq[0x5]; + + u8 log_max_eq_sz[0x8]; + u8 reserved_6[0x2]; + u8 log_max_mkey[0x6]; + u8 reserved_7[0xc]; + u8 log_max_eq[0x4]; + + u8 max_indirection[0x8]; + u8 reserved_8[0x1]; + u8 log_max_mrw_sz[0x7]; + u8 reserved_9[0x2]; + u8 log_max_bsf_list_size[0x6]; + u8 reserved_10[0x2]; + u8 log_max_klm_list_size[0x6]; + + u8 reserved_11[0xa]; + u8 log_max_ra_req_dc[0x6]; + u8 reserved_12[0xa]; + u8 log_max_ra_res_dc[0x6]; + + u8 reserved_13[0xa]; + u8 log_max_ra_req_qp[0x6]; + u8 reserved_14[0xa]; + u8 log_max_ra_res_qp[0x6]; + + u8 pad_cap[0x1]; + u8 cc_query_allowed[0x1]; + u8 cc_modify_allowed[0x1]; + u8 reserved_15[0x1d]; + + u8 reserved_16[0x6]; + u8 max_qp_cnt[0xa]; + u8 pkey_table_size[0x10]; + + u8 eswitch_owner[0x1]; + u8 reserved_17[0xa]; + u8 local_ca_ack_delay[0x5]; + u8 reserved_18[0x8]; + u8 num_ports[0x8]; + + u8 reserved_19[0x3]; + u8 log_max_msg[0x5]; + u8 reserved_20[0x18]; + + u8 stat_rate_support[0x10]; + u8 reserved_21[0x10]; + + u8 reserved_22[0x10]; + u8 cmdif_checksum[0x2]; + u8 sigerr_cqe[0x1]; + u8 reserved_23[0x1]; + u8 wq_signature[0x1]; + u8 sctr_data_cqe[0x1]; + u8 reserved_24[0x1]; + u8 sho[0x1]; + u8 tph[0x1]; + u8 rf[0x1]; + u8 dc[0x1]; + u8 reserved_25[0x2]; + u8 roce[0x1]; + u8 atomic[0x1]; + u8 rsz_srq[0x1]; + + u8 cq_oi[0x1]; + u8 cq_resize[0x1]; + u8 cq_moderation[0x1]; + u8 sniffer_rule_flow[0x1]; + u8 sniffer_rule_vport[0x1]; + u8 sniffer_rule_phy[0x1]; + u8 reserved_26[0x1]; + u8 pg[0x1]; + u8 block_lb_mc[0x1]; + u8 reserved_27[0x3]; + u8 cd[0x1]; + u8 reserved_28[0x1]; + u8 apm[0x1]; + u8 reserved_29[0x7]; + u8 qkv[0x1]; + u8 pkv[0x1]; + u8 reserved_30[0x4]; + u8 xrc[0x1]; + u8 ud[0x1]; + u8 uc[0x1]; + u8 rc[0x1]; + + u8 reserved_31[0xa]; + u8 uar_sz[0x6]; + u8 reserved_32[0x8]; + u8 log_pg_sz[0x8]; + + u8 bf[0x1]; + u8 reserved_33[0xa]; + u8 log_bf_reg_size[0x5]; + u8 reserved_34[0x10]; + + u8 reserved_35[0x10]; + u8 max_wqe_sz_sq[0x10]; + + u8 reserved_36[0x10]; + u8 max_wqe_sz_rq[0x10]; + + u8 reserved_37[0x10]; + u8 max_wqe_sz_sq_dc[0x10]; + + u8 reserved_38[0x7]; + u8 max_qp_mcg[0x19]; + + u8 reserved_39[0x18]; + u8 log_max_mcg[0x8]; + + u8 reserved_40[0xb]; + u8 log_max_pd[0x5]; + u8 reserved_41[0xb]; + u8 log_max_xrcd[0x5]; + + u8 reserved_42[0x20]; + + u8 reserved_43[0x3]; + u8 log_max_rq[0x5]; + u8 reserved_44[0x3]; + u8 log_max_sq[0x5]; + u8 reserved_45[0x3]; + u8 log_max_tir[0x5]; + u8 reserved_46[0x3]; + u8 log_max_tis[0x5]; + + u8 reserved_47[0x13]; + u8 log_max_rq_per_tir[0x5]; + u8 reserved_48[0x3]; + u8 log_max_tis_per_sq[0x5]; + + u8 reserved_49[0xe0]; + + u8 reserved_50[0x10]; + u8 log_uar_page_sz[0x10]; + + u8 reserved_51[0x100]; + + u8 reserved_52[0x1f]; + u8 cqe_zip[0x1]; + + u8 cqe_zip_timeout[0x10]; + u8 cqe_zip_max_num[0x10]; + + u8 reserved_53[0x220]; +}; + +struct mlx5_ifc_set_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + struct mlx5_ifc_cmd_hca_cap_bits hca_capability_struct; +}; + +struct mlx5_ifc_query_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_query_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + u8 capability_struct[256][0x8]; +}; + +struct mlx5_ifc_set_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 5903325a64834211daf63a62db3b35ee580cb8bf Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 2 Oct 2014 12:19:45 +0300 Subject: net/mlx5_core: Identify resources by their type This patch puts a common part as the first field of mlx5_core_qp. This field is used to identify which resource generated an event. This is required since upcoming new resource types such as DC targets are allocated for the same numerical space as regular QPs and may generate the same events. By searching the resource in the same table we can then look at the common field to identify the resource. Signed-off-by: Eli Cohen Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 12 +++--- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 57 ++++++++++++++++++++-------- include/linux/mlx5/driver.h | 13 ++++++- include/linux/mlx5/qp.h | 3 +- 4 files changed, 60 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 11b9b840ad4d..ed53291468f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -198,7 +198,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) int eqes_found = 0; int set_ci = 0; u32 cqn; - u32 srqn; + u32 rsn; u8 port; while ((eqe = next_eqe_sw(eq))) { @@ -224,18 +224,18 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) case MLX5_EVENT_TYPE_PATH_MIG_FAILED: case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; mlx5_core_dbg(dev, "event %s(%d) arrived\n", eqe_type_str(eqe->type), eqe->type); - mlx5_qp_event(dev, be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff, - eqe->type); + mlx5_rsc_event(dev, rsn, eqe->type); break; case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: - srqn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; mlx5_core_dbg(dev, "SRQ event %s(%d): srqn 0x%x\n", - eqe_type_str(eqe->type), eqe->type, srqn); - mlx5_srq_event(dev, srqn, eqe->type); + eqe_type_str(eqe->type), eqe->type, rsn); + mlx5_srq_event(dev, rsn, eqe->type); break; case MLX5_EVENT_TYPE_CMD: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 415b67ce379e..5261a2b0da43 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -39,28 +39,53 @@ #include "mlx5_core.h" -void mlx5_qp_event(struct mlx5_core_dev *dev, u32 qpn, int event_type) +static struct mlx5_core_rsc_common *mlx5_get_rsc(struct mlx5_core_dev *dev, + u32 rsn) { struct mlx5_qp_table *table = &dev->priv.qp_table; - struct mlx5_core_qp *qp; + struct mlx5_core_rsc_common *common; spin_lock(&table->lock); - qp = radix_tree_lookup(&table->tree, qpn); - if (qp) - atomic_inc(&qp->refcount); + common = radix_tree_lookup(&table->tree, rsn); + if (common) + atomic_inc(&common->refcount); spin_unlock(&table->lock); - if (!qp) { - mlx5_core_warn(dev, "Async event for bogus QP 0x%x\n", qpn); - return; + if (!common) { + mlx5_core_warn(dev, "Async event for bogus resource 0x%x\n", + rsn); + return NULL; } + return common; +} - qp->event(qp, event_type); +void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common) +{ + if (atomic_dec_and_test(&common->refcount)) + complete(&common->free); +} + +void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) +{ + struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn); + struct mlx5_core_qp *qp; + + if (!common) + return; + + switch (common->res) { + case MLX5_RES_QP: + qp = (struct mlx5_core_qp *)common; + qp->event(qp, event_type); + break; + + default: + mlx5_core_warn(dev, "invalid resource type for 0x%x\n", rsn); + } - if (atomic_dec_and_test(&qp->refcount)) - complete(&qp->free); + mlx5_core_put_rsc(common); } int mlx5_core_create_qp(struct mlx5_core_dev *dev, @@ -92,6 +117,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, qp->qpn = be32_to_cpu(out.qpn) & 0xffffff; mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn); + qp->common.res = MLX5_RES_QP; spin_lock_irq(&table->lock); err = radix_tree_insert(&table->tree, qp->qpn, qp); spin_unlock_irq(&table->lock); @@ -106,9 +132,9 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, qp->qpn); qp->pid = current->pid; - atomic_set(&qp->refcount, 1); + atomic_set(&qp->common.refcount, 1); atomic_inc(&dev->num_qps); - init_completion(&qp->free); + init_completion(&qp->common.free); return 0; @@ -138,9 +164,8 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, radix_tree_delete(&table->tree, qp->qpn); spin_unlock_irqrestore(&table->lock, flags); - if (atomic_dec_and_test(&qp->refcount)) - complete(&qp->free); - wait_for_completion(&qp->free); + mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); + wait_for_completion(&qp->common.free); memset(&in, 0, sizeof(in)); memset(&out, 0, sizeof(out)); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c439f9c59b93..246310dc8bef 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -375,6 +375,16 @@ struct mlx5_core_mr { u32 pd; }; +enum mlx5_res_type { + MLX5_RES_QP, +}; + +struct mlx5_core_rsc_common { + enum mlx5_res_type res; + atomic_t refcount; + struct completion free; +}; + struct mlx5_core_srq { u32 srqn; int max; @@ -700,7 +710,7 @@ int mlx5_eq_init(struct mlx5_core_dev *dev); void mlx5_eq_cleanup(struct mlx5_core_dev *dev); void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); -void mlx5_qp_event(struct mlx5_core_dev *dev, u32 qpn, int event_type); +void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector); @@ -737,6 +747,7 @@ void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, int npsvs, u32 *sig_index); int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); +void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); static inline u32 mlx5_mkey_to_idx(u32 mkey) { diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 9709b30e2d69..7c4c0f1f5805 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -342,10 +342,9 @@ struct mlx5_stride_block_ctrl_seg { }; struct mlx5_core_qp { + struct mlx5_core_rsc_common common; /* must be first */ void (*event) (struct mlx5_core_qp *, int); int qpn; - atomic_t refcount; - struct completion free; struct mlx5_rsc_debug *dbg; int pid; }; -- cgit v1.2.3 From efc98d08e1ec4fd131f794370b274dceaf32c958 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 3 Oct 2014 15:48:08 -0700 Subject: fou: eliminate IPv4,v6 specific GRO functions This patch removes fou[46]_gro_receive and fou[46]_gro_complete functions. The v4 or v6 variants were chosen for the UDP offloads based on the address family of the socket this is not necessary or correct. Alternatively, this patch adds is_ipv6 to napi_gro_skb. This is set in udp6_gro_receive and unset in udp4_gro_receive. In fou_gro_receive the value is used to select the correct inet_offloads for the protocol of the outer IP header. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/ipv4/fou.c | 48 ++++++++--------------------------------------- net/ipv4/udp_offload.c | 1 + net/ipv6/udp_offload.c | 1 + 4 files changed, 13 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 910fb17ad148..22d54b9b700d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1886,6 +1886,9 @@ struct napi_gro_cb { /* Number of checksums via CHECKSUM_UNNECESSARY */ u8 csum_cnt:3; + /* Used in foo-over-udp, set in udp[46]_gro_receive */ + u8 is_ipv6:1; + /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index dced89fbe480..7e2126a31f2e 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -65,14 +65,15 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) } static struct sk_buff **fou_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - const struct net_offload **offloads) + struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff **pp = NULL; u8 proto = NAPI_GRO_CB(skb)->proto; + const struct net_offload **offloads; rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out_unlock; @@ -85,14 +86,15 @@ out_unlock: return pp; } -static int fou_gro_complete(struct sk_buff *skb, int nhoff, - const struct net_offload **offloads) +static int fou_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; u8 proto = NAPI_GRO_CB(skb)->proto; int err = -ENOSYS; + const struct net_offload **offloads; rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; @@ -105,28 +107,6 @@ out_unlock: return err; } -static struct sk_buff **fou4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) -{ - return fou_gro_receive(head, skb, inet_offloads); -} - -static int fou4_gro_complete(struct sk_buff *skb, int nhoff) -{ - return fou_gro_complete(skb, nhoff, inet_offloads); -} - -static struct sk_buff **fou6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) -{ - return fou_gro_receive(head, skb, inet6_offloads); -} - -static int fou6_gro_complete(struct sk_buff *skb, int nhoff) -{ - return fou_gro_complete(skb, nhoff, inet6_offloads); -} - static int fou_add_to_port_list(struct fou *fou) { struct fou *fout; @@ -199,20 +179,8 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; - switch (cfg->udp_config.family) { - case AF_INET: - fou->udp_offloads.callbacks.gro_receive = fou4_gro_receive; - fou->udp_offloads.callbacks.gro_complete = fou4_gro_complete; - break; - case AF_INET6: - fou->udp_offloads.callbacks.gro_receive = fou6_gro_receive; - fou->udp_offloads.callbacks.gro_complete = fou6_gro_complete; - break; - default: - err = -EPFNOSUPPORT; - goto error; - } - + fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; fou->udp_offloads.port = cfg->udp_config.local_udp_port; fou->udp_offloads.ipproto = cfg->protocol; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 8c35f2c939ee..507310ef4b56 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -334,6 +334,7 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head, skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, inet_gro_compute_pseudo); skip: + NAPI_GRO_CB(skb)->is_ipv6 = 0; return udp_gro_receive(head, skb, uh); flush: diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 8f96988c1db2..6b8f543f6ac6 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -140,6 +140,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head, ip6_gro_compute_pseudo); skip: + NAPI_GRO_CB(skb)->is_ipv6 = 1; return udp_gro_receive(head, skb, uh); flush: -- cgit v1.2.3 From 37dd0247797b168ad1cc7f5dbec825a1ee66535b Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 3 Oct 2014 15:48:09 -0700 Subject: gue: Receive side for Generic UDP Encapsulation This patch adds support receiving for GUE packets in the fou module. The fou module now supports direct foo-over-udp (no encapsulation header) and GUE. To support this a type parameter is added to the fou netlink parameters. For a GUE socket we define gue_udp_recv, gue_gro_receive, and gue_gro_complete to handle the specifics of the GUE protocol. Most of the code to manage and configure sockets is common with the fou. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/gue.h | 23 ++++++ include/uapi/linux/fou.h | 7 ++ net/ipv4/fou.c | 196 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 217 insertions(+), 9 deletions(-) create mode 100644 include/net/gue.h (limited to 'include') diff --git a/include/net/gue.h b/include/net/gue.h new file mode 100644 index 000000000000..b6c332788084 --- /dev/null +++ b/include/net/gue.h @@ -0,0 +1,23 @@ +#ifndef __NET_GUE_H +#define __NET_GUE_H + +struct guehdr { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 hlen:4, + version:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 version:4, + hlen:4; +#else +#error "Please fix " +#endif + __u8 next_hdr; + __u16 flags; + }; + __u32 word; + }; +}; + +#endif diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h index e03376de453d..8df06894da23 100644 --- a/include/uapi/linux/fou.h +++ b/include/uapi/linux/fou.h @@ -13,6 +13,7 @@ enum { FOU_ATTR_PORT, /* u16 */ FOU_ATTR_AF, /* u8 */ FOU_ATTR_IPPROTO, /* u8 */ + FOU_ATTR_TYPE, /* u8 */ __FOU_ATTR_MAX, }; @@ -27,6 +28,12 @@ enum { __FOU_CMD_MAX, }; +enum { + FOU_ENCAP_UNSPEC, + FOU_ENCAP_DIRECT, + FOU_ENCAP_GUE, +}; + #define FOU_CMD_MAX (__FOU_CMD_MAX - 1) #endif /* _UAPI_LINUX_FOU_H */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 7e2126a31f2e..efa70ad44906 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ struct fou { }; struct fou_cfg { + u16 type; u8 protocol; struct udp_port_cfg udp_config; }; @@ -64,6 +66,41 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) sizeof(struct udphdr)); } +static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) +{ + struct fou *fou = fou_from_sock(sk); + size_t len; + struct guehdr *guehdr; + struct udphdr *uh; + + if (!fou) + return 1; + + len = sizeof(struct udphdr) + sizeof(struct guehdr); + if (!pskb_may_pull(skb, len)) + goto drop; + + uh = udp_hdr(skb); + guehdr = (struct guehdr *)&uh[1]; + + len += guehdr->hlen << 2; + if (!pskb_may_pull(skb, len)) + goto drop; + + if (guehdr->version != 0) + goto drop; + + if (guehdr->flags) { + /* No support yet */ + goto drop; + } + + return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len); +drop: + kfree_skb(skb); + return 0; +} + static struct sk_buff **fou_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -107,6 +144,112 @@ out_unlock: return err; } +static struct sk_buff **gue_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff **pp = NULL; + struct sk_buff *p; + u8 proto; + struct guehdr *guehdr; + unsigned int hlen, guehlen; + unsigned int off; + int flush = 1; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*guehdr); + guehdr = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + guehdr = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!guehdr)) + goto out; + } + + proto = guehdr->next_hdr; + + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_receive)) + goto out_unlock; + + guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + + hlen = off + guehlen; + if (skb_gro_header_hard(skb, hlen)) { + guehdr = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!guehdr)) + goto out_unlock; + } + + flush = 0; + + for (p = *head; p; p = p->next) { + const struct guehdr *guehdr2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + guehdr2 = (struct guehdr *)(p->data + off); + + /* Compare base GUE header to be equal (covers + * hlen, version, next_hdr, and flags. + */ + if (guehdr->word != guehdr2->word) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + /* Compare optional fields are the same. */ + if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1], + guehdr->hlen << 2)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + skb_gro_pull(skb, guehlen); + + /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ + skb_gro_postpull_rcsum(skb, guehdr, guehlen); + + pp = ops->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int gue_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct net_offload **offloads; + struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); + const struct net_offload *ops; + unsigned int guehlen; + u8 proto; + int err = -ENOENT; + + proto = guehdr->next_hdr; + + guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + goto out_unlock; + + err = ops->callbacks.gro_complete(skb, nhoff + guehlen); + +out_unlock: + rcu_read_unlock(); + return err; +} + static int fou_add_to_port_list(struct fou *fou) { struct fou *fout; @@ -142,6 +285,28 @@ static void fou_release(struct fou *fou) kfree(fou); } +static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) +{ + udp_sk(sk)->encap_rcv = fou_udp_recv; + fou->protocol = cfg->protocol; + fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; + fou->udp_offloads.port = cfg->udp_config.local_udp_port; + fou->udp_offloads.ipproto = cfg->protocol; + + return 0; +} + +static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) +{ + udp_sk(sk)->encap_rcv = gue_udp_recv; + fou->udp_offloads.callbacks.gro_receive = gue_gro_receive; + fou->udp_offloads.callbacks.gro_complete = gue_gro_complete; + fou->udp_offloads.port = cfg->udp_config.local_udp_port; + + return 0; +} + static int fou_create(struct net *net, struct fou_cfg *cfg, struct socket **sockp) { @@ -164,10 +329,24 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk = sock->sk; - /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ - fou->protocol = cfg->protocol; - fou->port = cfg->udp_config.local_udp_port; - udp_sk(sk)->encap_rcv = fou_udp_recv; + fou->port = cfg->udp_config.local_udp_port; + + /* Initial for fou type */ + switch (cfg->type) { + case FOU_ENCAP_DIRECT: + err = fou_encap_init(sk, fou, cfg); + if (err) + goto error; + break; + case FOU_ENCAP_GUE: + err = gue_encap_init(sk, fou, cfg); + if (err) + goto error; + break; + default: + err = -EINVAL; + goto error; + } udp_sk(sk)->encap_type = 1; udp_encap_enable(); @@ -179,11 +358,6 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; - fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; - fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; - fou->udp_offloads.port = cfg->udp_config.local_udp_port; - fou->udp_offloads.ipproto = cfg->protocol; - if (cfg->udp_config.family == AF_INET) { err = udp_add_offload(&fou->udp_offloads); if (err) @@ -240,6 +414,7 @@ static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { [FOU_ATTR_PORT] = { .type = NLA_U16, }, [FOU_ATTR_AF] = { .type = NLA_U8, }, [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, + [FOU_ATTR_TYPE] = { .type = NLA_U8, }, }; static int parse_nl_config(struct genl_info *info, @@ -267,6 +442,9 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_IPPROTO]) cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); + if (info->attrs[FOU_ATTR_TYPE]) + cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]); + return 0; } -- cgit v1.2.3 From bc1fc390e1728672b5b343b85185fcc1fe41043b Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 3 Oct 2014 15:48:10 -0700 Subject: ip_tunnel: Add GUE support This patch allows configuring IPIP, sit, and GRE tunnels to use GUE. This is very similar to fou excpet that we need to insert the GUE header in addition to the UDP header on transmit. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/if_tunnel.h | 1 + net/ipv4/ip_tunnel.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 7c832afdfa94..280d9e092283 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -64,6 +64,7 @@ enum { enum tunnel_encap_types { TUNNEL_ENCAP_NONE, TUNNEL_ENCAP_FOU, + TUNNEL_ENCAP_GUE, }; #define TUNNEL_ENCAP_FLAG_CSUM (1<<0) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d9c9dc4ffeaf..0bb8e141eacc 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -56,6 +56,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -495,6 +496,8 @@ static int ip_encap_hlen(struct ip_tunnel_encap *e) return 0; case TUNNEL_ENCAP_FOU: return sizeof(struct udphdr); + case TUNNEL_ENCAP_GUE: + return sizeof(struct udphdr) + sizeof(struct guehdr); default: return -EINVAL; } @@ -546,6 +549,15 @@ static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, skb_reset_transport_header(skb); uh = udp_hdr(skb); + if (e->type == TUNNEL_ENCAP_GUE) { + struct guehdr *guehdr = (struct guehdr *)&uh[1]; + + guehdr->version = 0; + guehdr->hlen = 0; + guehdr->flags = 0; + guehdr->next_hdr = *protocol; + } + uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); @@ -565,6 +577,7 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, case TUNNEL_ENCAP_NONE: return 0; case TUNNEL_ENCAP_FOU: + case TUNNEL_ENCAP_GUE: return fou_build_header(skb, &t->encap, t->encap_hlen, protocol, fl4); default: -- cgit v1.2.3 From c8753d55afb436fd6a25c8bbe8d783f6dcf1c9f8 Mon Sep 17 00:00:00 2001 From: Vijay Subramanian Date: Thu, 2 Oct 2014 10:00:43 -0700 Subject: net: Cleanup skb cloning by adding SKB_FCLONE_FREE SKB_FCLONE_UNAVAILABLE has overloaded meaning depending on type of skb. 1: If skb is allocated from head_cache, it indicates fclone is not available. 2: If skb is a companion fclone skb (allocated from fclone_cache), it indicates it is available to be used. To avoid confusion for case 2 above, this patch replaces SKB_FCLONE_UNAVAILABLE with SKB_FCLONE_FREE where appropriate. For fclone companion skbs, this indicates it is free for use. SKB_FCLONE_UNAVAILABLE will now simply indicate skb is from head_cache and cannot / will not have a companion fclone. Signed-off-by: Vijay Subramanian Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ++++--- net/core/skbuff.c | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7c5036d11feb..3a5ec7638627 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -339,9 +339,10 @@ struct skb_shared_info { enum { - SKB_FCLONE_UNAVAILABLE, - SKB_FCLONE_ORIG, - SKB_FCLONE_CLONE, + SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */ + SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */ + SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */ + SKB_FCLONE_FREE, /* this companion fclone skb is available */ }; enum { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a0b312fa3047..28916e47f959 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -265,7 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->fclone = SKB_FCLONE_ORIG; atomic_set(&fclones->fclone_ref, 1); - fclones->skb2.fclone = SKB_FCLONE_UNAVAILABLE; + fclones->skb2.fclone = SKB_FCLONE_FREE; fclones->skb2.pfmemalloc = pfmemalloc; } out: @@ -542,7 +542,7 @@ static void kfree_skbmem(struct sk_buff *skb) fclones = container_of(skb, struct sk_buff_fclones, skb2); /* Warning : We must perform the atomic_dec_and_test() before - * setting skb->fclone back to SKB_FCLONE_UNAVAILABLE, otherwise + * setting skb->fclone back to SKB_FCLONE_FREE, otherwise * skb_clone() could set clone_ref to 2 before our decrement. * Anyway, if we are going to free the structure, no need to * rewrite skb->fclone. @@ -553,7 +553,7 @@ static void kfree_skbmem(struct sk_buff *skb) /* The clone portion is available for * fast-cloning again. */ - skb->fclone = SKB_FCLONE_UNAVAILABLE; + skb->fclone = SKB_FCLONE_FREE; } break; } @@ -874,7 +874,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return NULL; if (skb->fclone == SKB_FCLONE_ORIG && - n->fclone == SKB_FCLONE_UNAVAILABLE) { + n->fclone == SKB_FCLONE_FREE) { n->fclone = SKB_FCLONE_CLONE; /* As our fastclone was free, clone_ref must be 1 at this point. * We could use atomic_inc() here, but it is faster -- cgit v1.2.3 From dd3619f2ed5bd5ffce90f4fd8361ccd46d59b9b6 Mon Sep 17 00:00:00 2001 From: Sébastien Barré Date: Thu, 2 Oct 2014 21:15:22 +0200 Subject: Removed unused inet6 address state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the inet6 state INET6_IFADDR_STATE_UP only appeared in its definition. Cc: Christoph Paasch Cc: Herbert Xu Signed-off-by: Sébastien Barré Signed-off-by: David S. Miller --- include/net/if_inet6.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index d07b1a64b4e7..55a8d4056cc9 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -35,7 +35,6 @@ enum { INET6_IFADDR_STATE_DAD, INET6_IFADDR_STATE_POSTDAD, INET6_IFADDR_STATE_ERRDAD, - INET6_IFADDR_STATE_UP, INET6_IFADDR_STATE_DEAD, }; -- cgit v1.2.3 From bdf6fa52f01b941d4a80372d56de465bdbbd1d23 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 3 Oct 2014 18:16:20 -0400 Subject: sctp: handle association restarts when the socket is closed. Currently association restarts do not take into consideration the state of the socket. When a restart happens, the current assocation simply transitions into established state. This creates a condition where a remote system, through a the restart procedure, may create a local association that is no way reachable by user. The conditions to trigger this are as follows: 1) Remote does not acknoledge some data causing data to remain outstanding. 2) Local application calls close() on the socket. Since data is still outstanding, the association is placed in SHUTDOWN_PENDING state. However, the socket is closed. 3) The remote tries to create a new association, triggering a restart on the local system. The association moves from SHUTDOWN_PENDING to ESTABLISHED. At this point, it is no longer reachable by any socket on the local system. This patch addresses the above situation by moving the newly ESTABLISHED association into SHUTDOWN-SENT state and bundling a SHUTDOWN after the COOKIE-ACK chunk. This way, the restarted associate immidiately enters the shutdown procedure and forces the termination of the unreachable association. Reported-by: David Laight Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sctp/command.h | 2 +- net/sctp/sm_statefuns.c | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index f22538e68245..d4a20d00461c 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -115,7 +115,7 @@ typedef enum { * analysis of the state functions, but in reality just taken from * thin air in the hopes othat we don't trigger a kernel panic. */ -#define SCTP_MAX_NUM_COMMANDS 14 +#define SCTP_MAX_NUM_COMMANDS 20 typedef union { void *zero_all; /* Set to NULL to clear the entire union */ diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d3f1ea460c50..c8f606324134 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1775,9 +1775,22 @@ static sctp_disposition_t sctp_sf_do_dupcook_a(struct net *net, /* Update the content of current association. */ sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); - sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, - SCTP_STATE(SCTP_STATE_ESTABLISHED)); - sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + if (sctp_state(asoc, SHUTDOWN_PENDING) && + (sctp_sstate(asoc->base.sk, CLOSING) || + sock_flag(asoc->base.sk, SOCK_DEAD))) { + /* if were currently in SHUTDOWN_PENDING, but the socket + * has been closed by user, don't transition to ESTABLISHED. + * Instead trigger SHUTDOWN bundled with COOKIE_ACK. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + return sctp_sf_do_9_2_start_shutdown(net, ep, asoc, + SCTP_ST_CHUNK(0), NULL, + commands); + } else { + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_ESTABLISHED)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + } return SCTP_DISPOSITION_CONSUME; nomem_ev: -- cgit v1.2.3 From 0b5e8b8eeae40bae6ad7c7e91c97c3c0d0e57882 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 3 Oct 2014 15:35:28 -0700 Subject: net: Add Geneve tunneling protocol driver This adds a device level support for Geneve -- Generic Network Virtualization Encapsulation. The protocol is documented at http://tools.ietf.org/html/draft-gross-geneve-01 Only protocol layer Geneve support is provided by this driver. Openvswitch can be used for configuring, set up and tear down functional Geneve tunnels. Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/net/geneve.h | 91 ++++++++++++ include/net/ip_tunnels.h | 2 + net/ipv4/Kconfig | 14 ++ net/ipv4/Makefile | 1 + net/ipv4/geneve.c | 373 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 481 insertions(+) create mode 100644 include/net/geneve.h create mode 100644 net/ipv4/geneve.c (limited to 'include') diff --git a/include/net/geneve.h b/include/net/geneve.h new file mode 100644 index 000000000000..ce98865318bf --- /dev/null +++ b/include/net/geneve.h @@ -0,0 +1,91 @@ +#ifndef __NET_GENEVE_H +#define __NET_GENEVE_H 1 + +#include + +struct geneve_sock; + +typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb); + +struct geneve_sock { + struct hlist_node hlist; + geneve_rcv_t *rcv; + void *rcv_data; + struct work_struct del_work; + struct socket *sock; + struct rcu_head rcu; + atomic_t refcnt; + struct udp_offload udp_offloads; +}; + +/* Geneve Header: + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |Ver| Opt Len |O|C| Rsvd. | Protocol Type | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Virtual Network Identifier (VNI) | Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Variable Length Options | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Option Header: + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Option Class | Type |R|R|R| Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Variable Option Data | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ + +struct geneve_opt { + __be16 opt_class; + u8 type; +#ifdef __LITTLE_ENDIAN_BITFIELD + u8 length:5; + u8 r3:1; + u8 r2:1; + u8 r1:1; +#else + u8 r1:1; + u8 r2:1; + u8 r3:1; + u8 length:5; +#endif + u8 opt_data[]; +}; + +#define GENEVE_CRIT_OPT_TYPE (1 << 7) + +struct genevehdr { +#ifdef __LITTLE_ENDIAN_BITFIELD + u8 opt_len:6; + u8 ver:2; + u8 rsvd1:6; + u8 critical:1; + u8 oam:1; +#else + u8 ver:2; + u8 opt_len:6; + u8 oam:1; + u8 critical:1; + u8 rsvd1:6; +#endif + __be16 proto_type; + u8 vni[3]; + u8 rsvd2; + struct geneve_opt options[]; +}; + +#define GENEVE_VER 0 +#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) + +struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool no_share, bool ipv6); + +void geneve_sock_release(struct geneve_sock *vs); + +int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + bool xnet); +#endif diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 7f538ba6e267..a9ce1556d773 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -95,6 +95,8 @@ struct ip_tunnel { #define TUNNEL_VERSION __cpu_to_be16(0x40) #define TUNNEL_NO_KEY __cpu_to_be16(0x80) #define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100) +#define TUNNEL_OAM __cpu_to_be16(0x0200) +#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) struct tnl_ptk_info { __be16 flags; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 69fb37854449..c2035447e649 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -453,6 +453,20 @@ config TCP_CONG_BIC increase provides TCP friendliness. See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ +config GENEVE + tristate "Generic Network Virtualization Encapsulation (Geneve)" + depends on INET + select NET_IP_TUNNEL + select NET_UDP_TUNNEL + ---help--- + This allows one to create Geneve virtual interfaces that provide + Layer 2 Networks over Layer 3 Networks. Geneve is often used + to tunnel virtual network infrastructure in virtualized environments. + For more information see: + http://tools.ietf.org/html/draft-gross-geneve-01 + + To compile this driver as a module, choose M here: the module + config TCP_CONG_CUBIC tristate "CUBIC TCP" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index d8105787c199..518c04ed666e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o +obj-$(CONFIG_GENEVE) += geneve.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c new file mode 100644 index 000000000000..f008c5515f48 --- /dev/null +++ b/net/ipv4/geneve.c @@ -0,0 +1,373 @@ +/* + * Geneve: Generic Network Virtualization Encapsulation + * + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#include +#include +#endif + +#define PORT_HASH_BITS 8 +#define PORT_HASH_SIZE (1<sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; +} + +/* Find geneve socket based on network namespace and UDP port */ +static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port) +{ + struct geneve_sock *gs; + + hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) { + if (inet_sk(gs->sock->sk)->inet_sport == port) + return gs; + } + + return NULL; +} + +static void geneve_build_header(struct genevehdr *geneveh, + __be16 tun_flags, u8 vni[3], + u8 options_len, u8 *options) +{ + geneveh->ver = GENEVE_VER; + geneveh->opt_len = options_len / 4; + geneveh->oam = !!(tun_flags & TUNNEL_OAM); + geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); + geneveh->rsvd1 = 0; + memcpy(geneveh->vni, vni, 3); + geneveh->proto_type = htons(ETH_P_TEB); + geneveh->rsvd2 = 0; + + memcpy(geneveh->options, options, options_len); +} + +/* Transmit a fully formated Geneve frame. + * + * When calling this function. The skb->data should point + * to the geneve header which is fully formed. + * + * This function will add other UDP tunnel headers. + */ +int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + bool xnet) +{ + struct genevehdr *gnvh; + int min_headroom; + int err; + + skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) + + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) + return err; + + if (vlan_tx_tag_present(skb)) { + if (unlikely(!__vlan_put_tag(skb, + skb->vlan_proto, + vlan_tx_tag_get(skb)))) { + err = -ENOMEM; + return err; + } + skb->vlan_tci = 0; + } + + gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); + geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); + + return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst, + tos, ttl, df, src_port, dst_port, xnet); +} +EXPORT_SYMBOL_GPL(geneve_xmit_skb); + +static void geneve_notify_add_rx_port(struct geneve_sock *gs) +{ + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + int err; + + if (sa_family == AF_INET) { + err = udp_add_offload(&gs->udp_offloads); + if (err) + pr_warn("geneve: udp_add_offload failed with status %d\n", + err); + } +} + +/* Callback from net/ipv4/udp.c to receive packets */ +static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct genevehdr *geneveh; + struct geneve_sock *gs; + int opts_len; + + /* Need Geneve and inner Ethernet header to be present */ + if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) + goto error; + + /* Return packets with reserved bits set */ + geneveh = geneve_hdr(skb); + + if (unlikely(geneveh->ver != GENEVE_VER)) + goto error; + + if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) + goto error; + + opts_len = geneveh->opt_len * 4; + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, + htons(ETH_P_TEB))) + goto drop; + + gs = rcu_dereference_sk_user_data(sk); + if (!gs) + goto drop; + + gs->rcv(gs, skb); + return 0; + +drop: + /* Consume bad packet */ + kfree_skb(skb); + return 0; + +error: + /* Let the UDP layer deal with the skb */ + return 1; +} + +static void geneve_del_work(struct work_struct *work) +{ + struct geneve_sock *gs = container_of(work, struct geneve_sock, + del_work); + + udp_tunnel_sock_release(gs->sock); + kfree_rcu(gs, rcu); +} + +static struct socket *geneve_create_sock(struct net *net, bool ipv6, + __be16 port) +{ + struct socket *sock; + struct udp_port_cfg udp_conf; + int err; + + memset(&udp_conf, 0, sizeof(udp_conf)); + + if (ipv6) { + udp_conf.family = AF_INET6; + } else { + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = INADDR_ANY; + } + + udp_conf.local_udp_port = port; + + /* Open UDP socket */ + err = udp_sock_create(net, &udp_conf, &sock); + if (err < 0) + return ERR_PTR(err); + + return sock; +} + +/* Create new listen socket if needed */ +static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool ipv6) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_sock *gs; + struct socket *sock; + struct udp_tunnel_sock_cfg tunnel_cfg; + + gs = kzalloc(sizeof(*gs), GFP_KERNEL); + if (!gs) + return ERR_PTR(-ENOMEM); + + INIT_WORK(&gs->del_work, geneve_del_work); + + sock = geneve_create_sock(net, ipv6, port); + if (IS_ERR(sock)) { + kfree(gs); + return ERR_CAST(sock); + } + + gs->sock = sock; + atomic_set(&gs->refcnt, 1); + gs->rcv = rcv; + gs->rcv_data = data; + + /* Initialize the geneve udp offloads structure */ + gs->udp_offloads.port = port; + gs->udp_offloads.callbacks.gro_receive = NULL; + gs->udp_offloads.callbacks.gro_complete = NULL; + + spin_lock(&gn->sock_lock); + hlist_add_head_rcu(&gs->hlist, gs_head(net, port)); + geneve_notify_add_rx_port(gs); + spin_unlock(&gn->sock_lock); + + /* Mark socket as an encapsulation socket */ + tunnel_cfg.sk_user_data = gs; + tunnel_cfg.encap_type = 1; + tunnel_cfg.encap_rcv = geneve_udp_encap_recv; + tunnel_cfg.encap_destroy = NULL; + setup_udp_tunnel_sock(net, sock, &tunnel_cfg); + + return gs; +} + +struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool no_share, bool ipv6) +{ + struct geneve_sock *gs; + + gs = geneve_socket_create(net, port, rcv, data, ipv6); + if (!IS_ERR(gs)) + return gs; + + if (no_share) /* Return error if sharing is not allowed. */ + return ERR_PTR(-EINVAL); + + gs = geneve_find_sock(net, port); + if (gs) { + if (gs->rcv == rcv) + atomic_inc(&gs->refcnt); + else + gs = ERR_PTR(-EBUSY); + } else { + gs = ERR_PTR(-EINVAL); + } + + return gs; +} +EXPORT_SYMBOL_GPL(geneve_sock_add); + +void geneve_sock_release(struct geneve_sock *gs) +{ + if (!atomic_dec_and_test(&gs->refcnt)) + return; + + queue_work(geneve_wq, &gs->del_work); +} +EXPORT_SYMBOL_GPL(geneve_sock_release); + +static __net_init int geneve_init_net(struct net *net) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + unsigned int h; + + spin_lock_init(&gn->sock_lock); + + for (h = 0; h < PORT_HASH_SIZE; ++h) + INIT_HLIST_HEAD(&gn->sock_list[h]); + + return 0; +} + +static struct pernet_operations geneve_net_ops = { + .init = geneve_init_net, + .exit = NULL, + .id = &geneve_net_id, + .size = sizeof(struct geneve_net), +}; + +static int __init geneve_init_module(void) +{ + int rc; + + geneve_wq = alloc_workqueue("geneve", 0, 0); + if (!geneve_wq) + return -ENOMEM; + + rc = register_pernet_subsys(&geneve_net_ops); + if (rc) + return rc; + + pr_info("Geneve driver\n"); + + return 0; +} +late_initcall(geneve_init_module); + +static void __exit geneve_cleanup_module(void) +{ + destroy_workqueue(geneve_wq); +} +module_exit(geneve_cleanup_module); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jesse Gross "); +MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); +MODULE_ALIAS_RTNL_LINK("geneve"); -- cgit v1.2.3 From 67fa034194bf82a3d5ca841759d921297daa63ca Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:30 -0700 Subject: openvswitch: Add support for matching on OAM packets. Some tunnel formats have mechanisms for indicating that packets are OAM frames that should be handled specially (either as high priority or not forwarded beyond an endpoint). This provides support for allowing those types of packets to be matched. Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/datapath.c | 1 + net/openvswitch/flow_netlink.c | 17 ++++++++++++----- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index f7fc507d82ab..7c06106f5af5 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -309,6 +309,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */ OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ + OVS_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9e3a2fae6a8f..f6bd93d5f435 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -369,6 +369,7 @@ static size_t key_attr_size(void) + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index f4c8daa73965..22c855fa0bc2 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -346,6 +346,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_TTL] = 1, [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, + [OVS_TUNNEL_KEY_ATTR_OAM] = 0, }; if (type > OVS_TUNNEL_KEY_ATTR_MAX) { @@ -390,6 +391,9 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_CSUM: tun_flags |= TUNNEL_CSUM; break; + case OVS_TUNNEL_KEY_ATTR_OAM: + tun_flags |= TUNNEL_OAM; + break; default: return -EINVAL; } @@ -431,21 +435,24 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; if (output->ipv4_src && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) return -EMSGSIZE; if (output->ipv4_dst && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) return -EMSGSIZE; if (output->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) return -EMSGSIZE; if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_CSUM) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_OAM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; nla_nest_end(skb, nla); -- cgit v1.2.3 From f0b128c1e2cc33ad104daf0f51a51e34f7763c5f Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:31 -0700 Subject: openvswitch: Wrap struct ovs_key_ipv4_tunnel in a new structure. Currently, the flow information that is matched for tunnels and the tunnel data passed around with packets is the same. However, as additional information is added this is not necessarily desirable, as in the case of pointers. This adds a new structure for tunnel metadata which currently contains only the existing struct. This change is purely internal to the kernel since the current OVS_KEY_ATTR_IPV4_TUNNEL is simply a compressed version of OVS_KEY_ATTR_TUNNEL that is translated at flow setup. Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 +- net/openvswitch/actions.c | 5 +++-- net/openvswitch/datapath.h | 2 +- net/openvswitch/flow.c | 6 +++--- net/openvswitch/flow.h | 30 +++++++++++++++++------------- net/openvswitch/flow_netlink.c | 38 +++++++++++++++++++++++++++++++------- net/openvswitch/vport-gre.c | 16 +++++++++------- net/openvswitch/vport-vxlan.c | 10 +++++----- net/openvswitch/vport.c | 6 +++--- net/openvswitch/vport.h | 2 +- 10 files changed, 74 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 7c06106f5af5..6753032832e2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -294,7 +294,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ #ifdef __KERNEL__ - OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ + OVS_KEY_ATTR_TUNNEL_INFO, /* struct ovs_tunnel_info */ #endif __OVS_KEY_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 6932a42e41a2..006886dbee36 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -590,8 +590,8 @@ static int execute_set_action(struct sk_buff *skb, skb->mark = nla_get_u32(nested_attr); break; - case OVS_KEY_ATTR_IPV4_TUNNEL: - OVS_CB(skb)->egress_tun_key = nla_data(nested_attr); + case OVS_KEY_ATTR_TUNNEL_INFO: + OVS_CB(skb)->egress_tun_info = nla_data(nested_attr); break; case OVS_KEY_ATTR_ETHERNET: @@ -778,6 +778,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); this_cpu_inc(exec_actions_level); + OVS_CB(skb)->egress_tun_info = NULL; err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index ac3f3df96961..974135439c5c 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -102,8 +102,8 @@ struct datapath { */ struct ovs_skb_cb { struct sw_flow *flow; + struct ovs_tunnel_info *egress_tun_info; struct vport *input_vport; - struct ovs_key_ipv4_tunnel *egress_tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 913bdc1a83c0..2924cb340868 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -642,12 +642,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, +int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ - if (tun_key) - memcpy(&key->tun_key, tun_key, sizeof(key->tun_key)); + if (tun_info) + memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); else memset(&key->tun_key, 0, sizeof(key->tun_key)); diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 0f5db4ec565d..fe5a71b81c1f 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -49,20 +49,24 @@ struct ovs_key_ipv4_tunnel { u8 ipv4_ttl; } __packed __aligned(4); /* Minimize padding. */ -static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, - const struct iphdr *iph, __be64 tun_id, - __be16 tun_flags) +struct ovs_tunnel_info { + struct ovs_key_ipv4_tunnel tunnel; +}; + +static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, + const struct iphdr *iph, + __be64 tun_id, __be16 tun_flags) { - tun_key->tun_id = tun_id; - tun_key->ipv4_src = iph->saddr; - tun_key->ipv4_dst = iph->daddr; - tun_key->ipv4_tos = iph->tos; - tun_key->ipv4_ttl = iph->ttl; - tun_key->tun_flags = tun_flags; + tun_info->tunnel.tun_id = tun_id; + tun_info->tunnel.ipv4_src = iph->saddr; + tun_info->tunnel.ipv4_dst = iph->daddr; + tun_info->tunnel.ipv4_tos = iph->tos; + tun_info->tunnel.ipv4_ttl = iph->ttl; + tun_info->tunnel.tun_flags = tun_flags; /* clear struct padding. */ - memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0, - sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE); + memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0, + sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); } struct sw_flow_key { @@ -190,8 +194,8 @@ void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, - struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, + struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ int ovs_flow_key_extract_userspace(const struct nlattr *attr, struct sk_buff *skb, diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 22c855fa0bc2..5d6194d9dadc 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1148,13 +1148,14 @@ out: return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); } -static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) +static struct nlattr *__add_action(struct sw_flow_actions **sfa, + int attrtype, void *data, int len) { struct nlattr *a; a = reserve_sfa_size(sfa, nla_attr_size(len)); if (IS_ERR(a)) - return PTR_ERR(a); + return a; a->nla_type = attrtype; a->nla_len = nla_attr_size(len); @@ -1163,6 +1164,18 @@ static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, in memcpy(nla_data(a), data, len); memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + return a; +} + +static int add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len) +{ + struct nlattr *a; + + a = __add_action(sfa, attrtype, data, len); + if (IS_ERR(a)) + return PTR_ERR(a); + return 0; } @@ -1268,6 +1281,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; + struct ovs_tunnel_info *tun_info; + struct nlattr *a; int err, start; ovs_match_init(&match, &key, NULL); @@ -1279,8 +1294,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; - err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, - sizeof(match.key->tun_key)); + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, + sizeof(*tun_info)); + if (IS_ERR(a)) + return PTR_ERR(a); + + tun_info = nla_data(a); + tun_info->tunnel = key.tun_key; + add_nested_action_end(*sfa, start); return err; @@ -1563,17 +1584,20 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) int err; switch (key_type) { - case OVS_KEY_ATTR_IPV4_TUNNEL: + case OVS_KEY_ATTR_TUNNEL_INFO: { + struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key), - nla_data(ovs_key)); + err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, + nla_data(ovs_key)); if (err) return err; nla_nest_end(skb, start); break; + } default: if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) return -EMSGSIZE; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 309cca6e816f..fe768bd600eb 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -63,8 +63,10 @@ static __be16 filter_tnl_flags(__be16 flags) static struct sk_buff *__build_header(struct sk_buff *skb, int tunnel_hlen) { - const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->egress_tun_key; struct tnl_ptk_info tpi; + const struct ovs_key_ipv4_tunnel *tun_key; + + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); if (IS_ERR(skb)) @@ -92,7 +94,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq) static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { - struct ovs_key_ipv4_tunnel tun_key; + struct ovs_tunnel_info tun_info; struct ovs_net *ovs_net; struct vport *vport; __be64 key; @@ -103,10 +105,10 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, - filter_tnl_flags(tpi->flags)); + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, + filter_tnl_flags(tpi->flags)); - ovs_vport_receive(vport, skb, &tun_key); + ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; } @@ -137,12 +139,12 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 df; int err; - if (unlikely(!OVS_CB(skb)->egress_tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; goto error; } - tun_key = OVS_CB(skb)->egress_tun_key; + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; /* Route lookup */ memset(&fl, 0, sizeof(fl)); fl.daddr = tun_key->ipv4_dst; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index f19539bb8adc..5fbff2c1ee49 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -58,7 +58,7 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport) /* Called with rcu_read_lock and BH disabled. */ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) { - struct ovs_key_ipv4_tunnel tun_key; + struct ovs_tunnel_info tun_info; struct vport *vport = vs->data; struct iphdr *iph; __be64 key; @@ -66,9 +66,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(vx_vni) >> 8); - ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); + ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY); - ovs_vport_receive(vport, skb, &tun_key); + ovs_vport_receive(vport, skb, &tun_info); } static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) @@ -147,12 +147,12 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 df; int err; - if (unlikely(!OVS_CB(skb)->egress_tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; goto error; } - tun_key = OVS_CB(skb)->egress_tun_key; + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; /* Route lookup */ memset(&fl, 0, sizeof(fl)); fl.daddr = tun_key->ipv4_dst; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 5df8377fcfb1..3e50ee8a218c 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -432,7 +432,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *p, struct sk_buff *skb) * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - struct ovs_key_ipv4_tunnel *tun_key) + struct ovs_tunnel_info *tun_info) { struct pcpu_sw_netstats *stats; struct sw_flow_key key; @@ -445,9 +445,9 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, u64_stats_update_end(&stats->syncp); OVS_CB(skb)->input_vport = vport; - OVS_CB(skb)->egress_tun_key = NULL; + OVS_CB(skb)->egress_tun_info = NULL; /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_key_extract(tun_key, skb, &key); + error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); return; diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 0efd62fef1e8..e28964aba021 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -207,7 +207,7 @@ static inline struct vport *vport_from_priv(void *priv) } void ovs_vport_receive(struct vport *, struct sk_buff *, - struct ovs_key_ipv4_tunnel *); + struct ovs_tunnel_info *); /* List of statically compiled vport implementations. Don't forget to also * add yours to the list at the top of vport.c. */ -- cgit v1.2.3 From f5796684069e0c71c65bce6a6d4766114aec1396 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:33 -0700 Subject: openvswitch: Add support for Geneve tunneling. The Openvswitch implementation is completely agnostic to the options that are in use and can handle newly defined options without further work. It does this by simply matching on a byte array of options and allowing userspace to setup flows on this array. Signed-off-by: Jesse Gross Singed-off-by: Ansis Atteka Signed-off-by: Andy Zhou Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 21 ++-- include/uapi/linux/openvswitch.h | 2 + net/openvswitch/Kconfig | 11 ++ net/openvswitch/Makefile | 4 + net/openvswitch/datapath.c | 5 +- net/openvswitch/flow.c | 20 +++- net/openvswitch/flow.h | 20 +++- net/openvswitch/flow_netlink.c | 176 ++++++++++++++++++++++++----- net/openvswitch/vport-geneve.c | 236 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/vport-gre.c | 2 +- net/openvswitch/vport-vxlan.c | 2 +- net/openvswitch/vport.c | 3 + net/openvswitch/vport.h | 1 + 13 files changed, 461 insertions(+), 42 deletions(-) create mode 100644 net/openvswitch/vport-geneve.c (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a9ce1556d773..5bc6edeb7143 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -86,17 +86,18 @@ struct ip_tunnel { struct gro_cells gro_cells; }; -#define TUNNEL_CSUM __cpu_to_be16(0x01) -#define TUNNEL_ROUTING __cpu_to_be16(0x02) -#define TUNNEL_KEY __cpu_to_be16(0x04) -#define TUNNEL_SEQ __cpu_to_be16(0x08) -#define TUNNEL_STRICT __cpu_to_be16(0x10) -#define TUNNEL_REC __cpu_to_be16(0x20) -#define TUNNEL_VERSION __cpu_to_be16(0x40) -#define TUNNEL_NO_KEY __cpu_to_be16(0x80) +#define TUNNEL_CSUM __cpu_to_be16(0x01) +#define TUNNEL_ROUTING __cpu_to_be16(0x02) +#define TUNNEL_KEY __cpu_to_be16(0x04) +#define TUNNEL_SEQ __cpu_to_be16(0x08) +#define TUNNEL_STRICT __cpu_to_be16(0x10) +#define TUNNEL_REC __cpu_to_be16(0x20) +#define TUNNEL_VERSION __cpu_to_be16(0x40) +#define TUNNEL_NO_KEY __cpu_to_be16(0x80) #define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100) -#define TUNNEL_OAM __cpu_to_be16(0x0200) -#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) +#define TUNNEL_OAM __cpu_to_be16(0x0200) +#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) +#define TUNNEL_OPTIONS_PRESENT __cpu_to_be16(0x0800) struct tnl_ptk_info { __be16 flags; diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 6753032832e2..435eabc5ffaa 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -192,6 +192,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ OVS_VPORT_TYPE_GRE, /* GRE tunnel. */ OVS_VPORT_TYPE_VXLAN, /* VXLAN tunnel. */ + OVS_VPORT_TYPE_GENEVE, /* Geneve tunnel. */ __OVS_VPORT_TYPE_MAX }; @@ -310,6 +311,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ OVS_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ + OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 6ecf491ad509..ba3bb8203b99 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -54,3 +54,14 @@ config OPENVSWITCH_VXLAN Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config OPENVSWITCH_GENEVE + bool "Open vSwitch Geneve tunneling support" + depends on INET + depends on OPENVSWITCH + depends on GENEVE && !(OPENVSWITCH=y && GENEVE=m) + default y + ---help--- + If you say Y here, then the Open vSwitch will be able create geneve vport. + + Say N to exclude this support and reduce the binary size. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 3591cb5dae91..9a33a273c375 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,6 +15,10 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o +ifneq ($(CONFIG_OPENVSWITCH_GENEVE),) +openvswitch-y += vport-geneve.o +endif + ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) openvswitch-y += vport-vxlan.o endif diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 010125c48244..2e31d9e7f4dc 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -370,6 +370,7 @@ static size_t key_attr_size(void) + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ @@ -556,10 +557,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts); - rcu_assign_pointer(flow->sf_acts, acts); if (err) goto err_flow_free; + rcu_assign_pointer(flow->sf_acts, acts); + + OVS_CB(packet)->egress_tun_info = NULL; OVS_CB(packet)->flow = flow; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 2924cb340868..62db02ba36bc 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -448,6 +448,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) int error; struct ethhdr *eth; + /* Flags are always used as part of stats */ + key->tp.flags = 0; + skb_reset_mac_header(skb); /* Link layer. We are guaranteed to have at least the 14 byte Ethernet @@ -646,10 +649,23 @@ int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ - if (tun_info) + if (tun_info) { memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); - else + + if (tun_info->options) { + BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * + 8)) - 1 + > sizeof(key->tun_opts)); + memcpy(GENEVE_OPTS(key, tun_info->options_len), + tun_info->options, tun_info->options_len); + key->tun_opts_len = tun_info->options_len; + } else { + key->tun_opts_len = 0; + } + } else { + key->tun_opts_len = 0; memset(&key->tun_key, 0, sizeof(key->tun_key)); + } key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index fe5a71b81c1f..71813318c8c7 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -51,11 +51,24 @@ struct ovs_key_ipv4_tunnel { struct ovs_tunnel_info { struct ovs_key_ipv4_tunnel tunnel; + struct geneve_opt *options; + u8 options_len; }; +/* Store options at the end of the array if they are less than the + * maximum size. This allows us to get the benefits of variable length + * matching for small options. + */ +#define GENEVE_OPTS(flow_key, opt_len) \ + ((struct geneve_opt *)((flow_key)->tun_opts + \ + FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \ + opt_len)) + static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, const struct iphdr *iph, - __be64 tun_id, __be16 tun_flags) + __be64 tun_id, __be16 tun_flags, + struct geneve_opt *opts, + u8 opts_len) { tun_info->tunnel.tun_id = tun_id; tun_info->tunnel.ipv4_src = iph->saddr; @@ -67,9 +80,14 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, /* clear struct padding. */ memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); + + tun_info->options = opts; + tun_info->options_len = opts_len; } struct sw_flow_key { + u8 tun_opts[255]; + u8 tun_opts_len; struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 5d6194d9dadc..368f23307911 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -88,18 +89,20 @@ static void update_range__(struct sw_flow_match *match, } \ } while (0) -#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - len, is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - memcpy(&(match)->mask->key.field, value_p, len);\ - } else { \ - memcpy(&(match)->key->field, value_p, len); \ - } \ +#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ + do { \ + update_range__(match, offset, len, is_mask); \ + if (is_mask) \ + memcpy((u8 *)&(match)->mask->key + offset, value_p, \ + len); \ + else \ + memcpy((u8 *)(match)->key + offset, value_p, len); \ } while (0) +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ + value_p, len, is_mask) + static u16 range_n_bytes(const struct sw_flow_key_range *range) { return range->end - range->start; @@ -335,6 +338,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, int rem; bool ttl = false; __be16 tun_flags = 0; + unsigned long opt_key_offset; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); @@ -347,6 +351,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, [OVS_TUNNEL_KEY_ATTR_OAM] = 0, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1, }; if (type > OVS_TUNNEL_KEY_ATTR_MAX) { @@ -355,7 +360,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - if (ovs_tunnel_key_lens[type] != nla_len(a)) { + if (ovs_tunnel_key_lens[type] != nla_len(a) && + ovs_tunnel_key_lens[type] != -1) { OVS_NLERR("IPv4 tunnel attribute type has unexpected " " length (type=%d, length=%d, expected=%d).\n", type, nla_len(a), ovs_tunnel_key_lens[type]); @@ -394,7 +400,60 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_OAM: tun_flags |= TUNNEL_OAM; break; + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + tun_flags |= TUNNEL_OPTIONS_PRESENT; + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR("Geneve option length exceeds maximum size (len %d, max %zu).\n", + nla_len(a), + sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (nla_len(a) % 4 != 0) { + OVS_NLERR("Geneve option length is not a multiple of 4 (len %d).\n", + nla_len(a)); + return -EINVAL; + } + + /* We need to record the length of the options passed + * down, otherwise packets with the same format but + * additional options will be silently matched. + */ + if (!is_mask) { + SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), + false); + } else { + /* This is somewhat unusual because it looks at + * both the key and mask while parsing the + * attributes (and by extension assumes the key + * is parsed first). Normally, we would verify + * that each is the correct length and that the + * attributes line up in the validate function. + * However, that is difficult because this is + * variable length and we won't have the + * information later. + */ + if (match->key->tun_opts_len != nla_len(a)) { + OVS_NLERR("Geneve option key length (%d) is different from mask length (%d).", + match->key->tun_opts_len, + nla_len(a)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, + true); + } + + opt_key_offset = (unsigned long)GENEVE_OPTS( + (struct sw_flow_key *)0, + nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, + nla_data(a), nla_len(a), + is_mask); + break; default: + OVS_NLERR("Unknown IPv4 tunnel attribute (%d).\n", + type); return -EINVAL; } } @@ -421,16 +480,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return 0; } -static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key, - const struct ovs_key_ipv4_tunnel *output) +static int __ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const struct geneve_opt *tun_opts, + int swkey_tun_opts_len) { - struct nlattr *nla; - - nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); - if (!nla) - return -EMSGSIZE; - if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; @@ -454,12 +508,35 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, if ((output->tun_flags & TUNNEL_OAM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; + if (tun_opts && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, + swkey_tun_opts_len, tun_opts)) + return -EMSGSIZE; - nla_nest_end(skb, nla); return 0; } +static int ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const struct geneve_opt *tun_opts, + int swkey_tun_opts_len) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len); + if (err) + return err; + + nla_nest_end(skb, nla); + return 0; +} + static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask) { @@ -905,9 +982,16 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if ((swkey->tun_key.ipv4_dst || is_mask) && - ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) - goto nla_put_failure; + if ((swkey->tun_key.ipv4_dst || is_mask)) { + const struct geneve_opt *opts = NULL; + + if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) + opts = GENEVE_OPTS(output, swkey->tun_opts_len); + + if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len)) + goto nla_put_failure; + } if (swkey->phy.in_port == DP_MAX_PORTS) { if (is_mask && (output->phy.in_port == 0xffff)) @@ -1290,17 +1374,55 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (err) return err; + if (key.tun_opts_len) { + struct geneve_opt *option = GENEVE_OPTS(&key, + key.tun_opts_len); + int opts_len = key.tun_opts_len; + bool crit_opt = false; + + while (opts_len > 0) { + int len; + + if (opts_len < sizeof(*option)) + return -EINVAL; + + len = sizeof(*option) + option->length * 4; + if (len > opts_len) + return -EINVAL; + + crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE); + + option = (struct geneve_opt *)((u8 *)option + len); + opts_len -= len; + }; + + key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; + }; + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); if (start < 0) return start; a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info)); + sizeof(*tun_info) + key.tun_opts_len); if (IS_ERR(a)) return PTR_ERR(a); tun_info = nla_data(a); tun_info->tunnel = key.tun_key; + tun_info->options_len = key.tun_opts_len; + + if (tun_info->options_len) { + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + memcpy((tun_info + 1), GENEVE_OPTS(&key, key.tun_opts_len), + key.tun_opts_len); + tun_info->options = (struct geneve_opt *)(tun_info + 1); + } else { + tun_info->options = NULL; + } add_nested_action_end(*sfa, start); @@ -1592,7 +1714,9 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) return -EMSGSIZE; err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, - nla_data(ovs_key)); + tun_info->options_len ? + tun_info->options : NULL, + tun_info->options_len); if (err) return err; nla_nest_end(skb, start); diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c new file mode 100644 index 000000000000..5572d482f285 --- /dev/null +++ b/net/openvswitch/vport-geneve.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +/** + * struct geneve_port - Keeps track of open UDP ports + * @sock: The socket created for this port number. + * @name: vport name. + */ +struct geneve_port { + struct geneve_sock *gs; + char name[IFNAMSIZ]; +}; + +static LIST_HEAD(geneve_ports); + +static inline struct geneve_port *geneve_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) +{ + return (struct genevehdr *)(udp_hdr(skb) + 1); +} + +/* Convert 64 bit tunnel ID to 24 bit VNI. */ +static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) +{ +#ifdef __BIG_ENDIAN + vni[0] = (__force __u8)(tun_id >> 16); + vni[1] = (__force __u8)(tun_id >> 8); + vni[2] = (__force __u8)tun_id; +#else + vni[0] = (__force __u8)((__force u64)tun_id >> 40); + vni[1] = (__force __u8)((__force u64)tun_id >> 48); + vni[2] = (__force __u8)((__force u64)tun_id >> 56); +#endif +} + +/* Convert 24 bit VNI to 64 bit tunnel ID. */ +static __be64 vni_to_tunnel_id(__u8 *vni) +{ +#ifdef __BIG_ENDIAN + return (vni[0] << 16) | (vni[1] << 8) | vni[2]; +#else + return (__force __be64)(((__force u64)vni[0] << 40) | + ((__force u64)vni[1] << 48) | + ((__force u64)vni[2] << 56)); +#endif +} + +static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) +{ + struct vport *vport = gs->rcv_data; + struct genevehdr *geneveh = geneve_hdr(skb); + int opts_len; + struct ovs_tunnel_info tun_info; + __be64 key; + __be16 flags; + + opts_len = geneveh->opt_len * 4; + + flags = TUNNEL_KEY | TUNNEL_OPTIONS_PRESENT | + (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | + (geneveh->oam ? TUNNEL_OAM : 0) | + (geneveh->critical ? TUNNEL_CRIT_OPT : 0); + + key = vni_to_tunnel_id(geneveh->vni); + + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, flags, + geneveh->options, opts_len); + + ovs_vport_receive(vport, skb, &tun_info); +} + +static int geneve_get_options(const struct vport *vport, + struct sk_buff *skb) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + __be16 sport; + + sport = ntohs(inet_sk(geneve_port->gs->sock->sk)->inet_sport); + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, sport)) + return -EMSGSIZE; + return 0; +} + +static void geneve_tnl_destroy(struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + geneve_sock_release(geneve_port->gs); + + ovs_vport_deferred_free(vport); +} + +static struct vport *geneve_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct geneve_port *geneve_port; + struct geneve_sock *gs; + struct vport *vport; + struct nlattr *a; + int err; + u16 dst_port; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct geneve_port), + &ovs_geneve_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + geneve_port = geneve_vport(vport); + strncpy(geneve_port->name, parms->name, IFNAMSIZ); + + gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0); + if (IS_ERR(gs)) { + ovs_vport_free(vport); + return (void *)gs; + } + geneve_port->gs = gs; + + return vport; +error: + return ERR_PTR(err); +} + +static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct ovs_key_ipv4_tunnel *tun_key; + struct ovs_tunnel_info *tun_info; + struct net *net = ovs_dp_get_net(vport->dp); + struct geneve_port *geneve_port = geneve_vport(vport); + __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; + __be16 sport; + struct rtable *rt; + struct flowi4 fl; + u8 vni[3]; + __be16 df; + int err; + + tun_info = OVS_CB(skb)->egress_tun_info; + if (unlikely(!tun_info)) { + err = -EINVAL; + goto error; + } + + tun_key = &tun_info->tunnel; + + /* Route lookup */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + tunnel_id_to_vni(tun_key->tun_id, vni); + skb->ignore_df = 1; + + err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, + tun_key->ipv4_dst, tun_key->ipv4_tos, + tun_key->ipv4_ttl, df, sport, dport, + tun_key->tun_flags, vni, + tun_info->options_len, (u8 *)tun_info->options, + false); + if (err < 0) + ip_rt_put(rt); +error: + return err; +} + +static const char *geneve_get_name(const struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + return geneve_port->name; +} + +const struct vport_ops ovs_geneve_vport_ops = { + .type = OVS_VPORT_TYPE_GENEVE, + .create = geneve_tnl_create, + .destroy = geneve_tnl_destroy, + .get_name = geneve_get_name, + .get_options = geneve_get_options, + .send = geneve_tnl_send, +}; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index fe768bd600eb..108b82da2fd9 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -106,7 +106,7 @@ static int gre_rcv(struct sk_buff *skb, key = key_to_tunnel_id(tpi->key, tpi->seq); ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, - filter_tnl_flags(tpi->flags)); + filter_tnl_flags(tpi->flags), NULL, 0); ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 5fbff2c1ee49..2735e01dca73 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -66,7 +66,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(vx_vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY); + ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY, NULL, 0); ovs_vport_receive(vport, skb, &tun_info); } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 3e50ee8a218c..53001b020ca7 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -48,6 +48,9 @@ static const struct vport_ops *vport_ops_list[] = { #ifdef CONFIG_OPENVSWITCH_VXLAN &ovs_vxlan_vport_ops, #endif +#ifdef CONFIG_OPENVSWITCH_GENEVE + &ovs_geneve_vport_ops, +#endif }; /* Protected by RCU read lock for reading, ovs_mutex for writing. */ diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index e28964aba021..8942125de3a6 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -215,6 +215,7 @@ extern const struct vport_ops ovs_netdev_vport_ops; extern const struct vport_ops ovs_internal_vport_ops; extern const struct vport_ops ovs_gre_vport_ops; extern const struct vport_ops ovs_vxlan_vport_ops; +extern const struct vport_ops ovs_geneve_vport_ops; static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) -- cgit v1.2.3 From f2600cf02b5b59aaee082c3485b7f01fc7f7b70c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Oct 2014 10:11:31 -0700 Subject: net: sched: avoid costly atomic operation in fq_dequeue() Standard qdisc API to setup a timer implies an atomic operation on every packet dequeue : qdisc_unthrottled() It turns out this is not really needed for FQ, as FQ has no concept of global qdisc throttling, being a qdisc handling many different flows, some of them can be throttled, while others are not. Fix is straightforward : add a 'bool throttle' to qdisc_watchdog_schedule_ns(), and remove calls to qdisc_unthrottled() in sch_fq. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 4 ++-- net/sched/sch_api.c | 5 +++-- net/sched/sch_fq.c | 6 ++---- net/sched/sch_tbf.c | 3 ++- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index e4b3c828c1c2..27a33833ff4a 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -65,12 +65,12 @@ struct qdisc_watchdog { }; void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle); static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) { - qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires)); + qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires), true); } void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c79a226cc25c..2cf61b3e633c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -594,13 +594,14 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle) { if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; - qdisc_throttled(wd->qdisc); + if (throttle) + qdisc_throttled(wd->qdisc); hrtimer_start(&wd->timer, ns_to_ktime(expires), diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index c9b9fcb53206..cbd7e1fd23b4 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -377,7 +377,6 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (time_after(jiffies, f->age + q->flow_refill_delay)) f->credit = max_t(u32, f->credit, q->quantum); q->inactive_flows--; - qdisc_unthrottled(sch); } /* Note: this overwrites f->age */ @@ -385,7 +384,6 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(f == &q->internal)) { q->stat_internal_packets++; - qdisc_unthrottled(sch); } sch->q.qlen++; @@ -433,7 +431,8 @@ begin: if (!head->first) { if (q->time_next_delayed_flow != ~0ULL) qdisc_watchdog_schedule_ns(&q->watchdog, - q->time_next_delayed_flow); + q->time_next_delayed_flow, + false); return NULL; } } @@ -495,7 +494,6 @@ begin: } out: qdisc_bstats_update(sch, skb); - qdisc_unthrottled(sch); return skb; } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 77edffe329c4..a4afde14e865 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -268,7 +268,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) } qdisc_watchdog_schedule_ns(&q->watchdog, - now + max_t(long, -toks, -ptoks)); + now + max_t(long, -toks, -ptoks), + true); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, -- cgit v1.2.3 From 7dfa4b414d4eec8da56e44fb2b4aea3e549b092f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 12:35:09 +0300 Subject: net/mlx4_en: Code cleanups in tx path - Remove unused variable ring->poll_cnt - No need to set some fields if using blueflame - Add missing const's - Use unlikely - Remove unneeded new line - Make some comments more precise - struct mlx4_bf @offset field reduced to unsigned int to save space Signed-off-by: Eric Dumazet Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 49 +++++++++++++++------------- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 - include/linux/mlx4/device.h | 2 +- 3 files changed, 27 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 0c501253fdab..eaf23eb7266a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -191,7 +191,6 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, ring->prod = 0; ring->cons = 0xffffffff; ring->last_nr_txbb = 1; - ring->poll_cnt = 0; memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info)); memset(ring->buf, 0, ring->buf_size); @@ -512,7 +511,8 @@ static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv, return ring->buf + index * TXBB_SIZE; } -static int is_inline(int inline_thold, struct sk_buff *skb, void **pfrag) +static bool is_inline(int inline_thold, const struct sk_buff *skb, + void **pfrag) { void *ptr; @@ -535,7 +535,7 @@ static int is_inline(int inline_thold, struct sk_buff *skb, void **pfrag) return 0; } -static int inline_size(struct sk_buff *skb) +static int inline_size(const struct sk_buff *skb) { if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg) <= MLX4_INLINE_ALIGN) @@ -546,7 +546,8 @@ static int inline_size(struct sk_buff *skb) sizeof(struct mlx4_wqe_inline_seg), 16); } -static int get_real_size(struct sk_buff *skb, struct net_device *dev, +static int get_real_size(const struct sk_buff *skb, + struct net_device *dev, int *lso_header_size) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -581,8 +582,10 @@ static int get_real_size(struct sk_buff *skb, struct net_device *dev, return real_size; } -static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *skb, - int real_size, u16 *vlan_tag, int tx_ind, void *fragptr) +static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, + const struct sk_buff *skb, + int real_size, u16 *vlan_tag, + int tx_ind, void *fragptr) { struct mlx4_wqe_inline_seg *inl = &tx_desc->inl; int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl; @@ -642,7 +645,8 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, return fallback(dev, skb) % rings_p_up + up * rings_p_up; } -static void mlx4_bf_copy(void __iomem *dst, unsigned long *src, unsigned bytecnt) +static void mlx4_bf_copy(void __iomem *dst, const void *src, + unsigned int bytecnt) { __iowrite64_copy(dst, src, bytecnt / 8); } @@ -736,11 +740,10 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) tx_info->skb = skb; tx_info->nr_txbb = nr_txbb; + data = &tx_desc->data; if (lso_header_size) data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4, DS_SIZE)); - else - data = &tx_desc->data; /* valid only for none inline segments */ tx_info->data_offset = (void *)data - (void *)tx_desc; @@ -753,9 +756,9 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) if (is_inline(ring->inline_thold, skb, &fragptr)) { tx_info->inl = 1; } else { - /* Map fragments */ + /* Map fragments if any */ for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) { - struct skb_frag_struct *frag; + const struct skb_frag_struct *frag; dma_addr_t dma; frag = &skb_shinfo(skb)->frags[i]; @@ -772,7 +775,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) --data; } - /* Map linear part */ + /* Map linear part if needed */ if (tx_info->linear) { u32 byte_count = skb_headlen(skb) - lso_header_size; dma_addr_t dma; @@ -795,18 +798,14 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) * For timestamping add flag to skb_shinfo and * set flag for further reference */ - if (ring->hwtstamp_tx_type == HWTSTAMP_TX_ON && - skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) { - skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON && + shinfo->tx_flags & SKBTX_HW_TSTAMP)) { + shinfo->tx_flags |= SKBTX_IN_PROGRESS; tx_info->ts_requested = 1; } /* Prepare ctrl segement apart opcode+ownership, which depends on * whether LSO is used */ - tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); - tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * - !!vlan_tx_tag_present(skb); - tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f; tx_desc->ctrl.srcrb_flags = priv->ctrl_flags; if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | @@ -852,7 +851,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); ring->packets++; - } ring->bytes += tx_info->nr_bytes; netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes); @@ -874,7 +872,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) ring->prod += nr_txbb; /* If we used a bounce buffer then copy descriptor back into place */ - if (bounce) + if (unlikely(bounce)) tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size); skb_tx_timestamp(skb); @@ -894,13 +892,18 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) wmb(); - mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl, - desc_size); + mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl, + desc_size); wmb(); ring->bf.offset ^= ring->bf.buf_size; } else { + tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); + tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * + !!vlan_tx_tag_present(skb); + tx_desc->ctrl.fence_size = real_size; + /* Ensure new descriptor hits memory * before setting ownership of this descriptor to HW */ diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 84c9d5dbfa4f..e54b653de3d4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -263,7 +263,6 @@ struct mlx4_en_tx_ring { u32 buf_size; u32 doorbell_qpn; void *buf; - u16 poll_cnt; struct mlx4_en_tx_info *tx_info; u8 *bounce_buf; u8 queue_index; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index b2f8ab9a57c4..37e4404d0227 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -583,7 +583,7 @@ struct mlx4_uar { }; struct mlx4_bf { - unsigned long offset; + unsigned int offset; int buf_size; struct mlx4_uar *uar; void __iomem *reg; -- cgit v1.2.3 From 1255a5055449781a92076fc5429952f2b33cf309 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 12:35:21 +0300 Subject: ethtool: Ethtool parameter to dynamically change tx_copybreak Use new ethtool [sg]et_tunable() to set tx_copybread (inline threshold) Signed-off-by: Eric Dumazet Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 1 + net/core/ethtool.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 7a364f2f3d3f..99b43056a6fe 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -212,6 +212,7 @@ struct ethtool_value { enum tunable_id { ETHTOOL_ID_UNSPEC, ETHTOOL_RX_COPYBREAK, + ETHTOOL_TX_COPYBREAK, }; enum tunable_type_id { diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 27e61b886520..1600aa24d36b 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1625,6 +1625,7 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: + case ETHTOOL_TX_COPYBREAK: if (tuna->len != sizeof(u32) || tuna->type_id != ETHTOOL_TUNABLE_U32) return -EINVAL; -- cgit v1.2.3 From fcbeb976d7ce783fd58e63e61c196d9a8912b3be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 10:11:27 -0700 Subject: net: introduce netdevice gso_min_segs attribute Some TSO engines might have a too heavy setup cost, that impacts performance on hosts sending small bursts (2 MSS per packet). This patch adds a device gso_min_segs, allowing drivers to set a minimum segment size for TSO packets, according to the NIC performance. Tested on a mlx4 NIC, this allows to get a ~110% increase of throughput when sending 2 MSS per packet. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- net/core/dev.c | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 22d54b9b700d..2df86f50261c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1416,6 +1416,8 @@ enum netdev_priv_flags { * @gso_max_size: Maximum size of generic segmentation offload * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO + * @gso_min_segs: Minimum number of segments that can be passed to the + * NIC for GSO * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device @@ -1666,7 +1668,7 @@ struct net_device { unsigned int gso_max_size; #define GSO_MAX_SEGS 65535 u16 gso_max_segs; - + u16 gso_min_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif diff --git a/net/core/dev.c b/net/core/dev.c index 7d5691cc1f47..c1a4de275576 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2567,10 +2567,12 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t netif_skb_features(struct sk_buff *skb) { + const struct net_device *dev = skb->dev; + netdev_features_t features = dev->features; + u16 gso_segs = skb_shinfo(skb)->gso_segs; __be16 protocol = skb->protocol; - netdev_features_t features = skb->dev->features; - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { @@ -2581,7 +2583,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) } features = netdev_intersect_features(features, - skb->dev->vlan_features | + dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); @@ -6661,6 +6663,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gso_min_segs = 0; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); -- cgit v1.2.3 From 82a470f1119eb7d2e4941b915bf9cd6fd8d54494 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 5 Oct 2014 21:27:53 -0700 Subject: net: sched: remove tcf_proto from ematch calls This removes the tcf_proto argument from the ematch code paths that only need it to reference the net namespace. This allows simplifying qdisc code paths especially when we need to tear down the ematch from an RCU callback. In this case we can not guarentee that the tcf_proto structure is still valid. Signed-off-by: John Fastabend Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 10 +++++----- net/sched/cls_basic.c | 2 +- net/sched/cls_cgroup.c | 4 ++-- net/sched/cls_flow.c | 4 ++-- net/sched/em_canid.c | 4 ++-- net/sched/em_ipset.c | 7 +++---- net/sched/em_meta.c | 4 ++-- net/sched/em_nbyte.c | 2 +- net/sched/em_text.c | 4 ++-- net/sched/ematch.c | 10 ++++++---- 10 files changed, 26 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index ef44ad9a6426..bc49967e1a68 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -166,6 +166,7 @@ struct tcf_ematch { unsigned int datalen; u16 matchid; u16 flags; + struct net *net; }; static inline int tcf_em_is_container(struct tcf_ematch *em) @@ -229,12 +230,11 @@ struct tcf_ematch_tree { struct tcf_ematch_ops { int kind; int datalen; - int (*change)(struct tcf_proto *, void *, + int (*change)(struct net *net, void *, int, struct tcf_ematch *); int (*match)(struct sk_buff *, struct tcf_ematch *, struct tcf_pkt_info *); - void (*destroy)(struct tcf_proto *, - struct tcf_ematch *); + void (*destroy)(struct tcf_ematch *); int (*dump)(struct sk_buff *, struct tcf_ematch *); struct module *owner; struct list_head link; @@ -244,7 +244,7 @@ int tcf_em_register(struct tcf_ematch_ops *); void tcf_em_unregister(struct tcf_ematch_ops *); int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *, struct tcf_ematch_tree *); -void tcf_em_tree_destroy(struct tcf_proto *, struct tcf_ematch_tree *); +void tcf_em_tree_destroy(struct tcf_ematch_tree *); int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); @@ -301,7 +301,7 @@ struct tcf_ematch_tree { }; #define tcf_em_tree_validate(tp, tb, t) ((void)(t), 0) -#define tcf_em_tree_destroy(tp, t) do { (void)(t); } while(0) +#define tcf_em_tree_destroy(t) do { (void)(t); } while(0) #define tcf_em_tree_dump(skb, t, tlv) (0) #define tcf_em_tree_change(tp, dst, src) do { } while(0) #define tcf_em_tree_match(skb, t, info) ((void)(info), 1) diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index fe20826a79e7..90647a8af8ca 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -95,7 +95,7 @@ static void basic_delete_filter(struct rcu_head *head) tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(&f->exts); - tcf_em_tree_destroy(tp, &f->ematches); + tcf_em_tree_destroy(&f->ematches); kfree(f); } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 3409f168225f..2f77a89655dc 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -87,7 +87,7 @@ static void cls_cgroup_destroy_rcu(struct rcu_head *root) rcu); tcf_exts_destroy(&head->exts); - tcf_em_tree_destroy(head->tp, &head->ematches); + tcf_em_tree_destroy(&head->ematches); kfree(head); } @@ -157,7 +157,7 @@ static void cls_cgroup_destroy(struct tcf_proto *tp) if (head) { tcf_exts_destroy(&head->exts); - tcf_em_tree_destroy(tp, &head->ematches); + tcf_em_tree_destroy(&head->ematches); RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index f18d27f7b5f2..a5d2b20db560 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -355,7 +355,7 @@ static void flow_destroy_filter(struct rcu_head *head) del_timer_sync(&f->perturb_timer); tcf_exts_destroy(&f->exts); - tcf_em_tree_destroy(f->tp, &f->ematches); + tcf_em_tree_destroy(&f->ematches); kfree(f); } @@ -530,7 +530,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, return 0; err2: - tcf_em_tree_destroy(tp, &t); + tcf_em_tree_destroy(&t); kfree(fnew); err1: tcf_exts_destroy(&e); diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index 7c292d474f47..ddd883ca55b2 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -120,7 +120,7 @@ static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, return match; } -static int em_canid_change(struct tcf_proto *tp, void *data, int len, +static int em_canid_change(struct net *net, void *data, int len, struct tcf_ematch *m) { struct can_filter *conf = data; /* Array with rules */ @@ -183,7 +183,7 @@ static int em_canid_change(struct tcf_proto *tp, void *data, int len, return 0; } -static void em_canid_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +static void em_canid_destroy(struct tcf_ematch *m) { struct canid_match *cm = em_canid_priv(m); diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index 527aeb7a3ff0..5b4a4efe468c 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -19,12 +19,11 @@ #include #include -static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, +static int em_ipset_change(struct net *net, void *data, int data_len, struct tcf_ematch *em) { struct xt_set_info *set = data; ip_set_id_t index; - struct net *net = dev_net(qdisc_dev(tp->q)); if (data_len != sizeof(*set)) return -EINVAL; @@ -42,11 +41,11 @@ static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, return -ENOMEM; } -static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em) +static void em_ipset_destroy(struct tcf_ematch *em) { const struct xt_set_info *set = (const void *) em->data; if (set) { - ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index); + ip_set_nfnl_put(em->net, set->index); kfree((void *) em->data); } } diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 9b8c0b0e60d7..c8f8c399b99a 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -856,7 +856,7 @@ static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) }, }; -static int em_meta_change(struct tcf_proto *tp, void *data, int len, +static int em_meta_change(struct net *net, void *data, int len, struct tcf_ematch *m) { int err; @@ -908,7 +908,7 @@ errout: return err; } -static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +static void em_meta_destroy(struct tcf_ematch *m) { if (m) meta_delete((struct meta_match *) m->data); diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index a3bed07a008b..df3110d69585 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -23,7 +23,7 @@ struct nbyte_data { char pattern[0]; }; -static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len, +static int em_nbyte_change(struct net *net, void *data, int data_len, struct tcf_ematch *em) { struct tcf_em_nbyte *nbyte = data; diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 15d353d2e4be..f03c3de16c27 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -45,7 +45,7 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX; } -static int em_text_change(struct tcf_proto *tp, void *data, int len, +static int em_text_change(struct net *net, void *data, int len, struct tcf_ematch *m) { struct text_match *tm; @@ -100,7 +100,7 @@ retry: return 0; } -static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +static void em_text_destroy(struct tcf_ematch *m) { if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) textsearch_destroy(EM_TEXT_PRIV(m)->config); diff --git a/net/sched/ematch.c b/net/sched/ematch.c index ad57f4444b9c..8250c36543d8 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -178,6 +178,7 @@ static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_hdr *em_hdr = nla_data(nla); int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); + struct net *net = dev_net(qdisc_dev(tp->q)); if (!TCF_EM_REL_VALID(em_hdr->flags)) goto errout; @@ -240,7 +241,7 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; if (em->ops->change) { - err = em->ops->change(tp, data, data_len, em); + err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; } else if (data_len > 0) { @@ -271,6 +272,7 @@ static int tcf_em_validate(struct tcf_proto *tp, em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; em->datalen = data_len; + em->net = net; err = 0; errout: @@ -378,7 +380,7 @@ errout: return err; errout_abort: - tcf_em_tree_destroy(tp, tree); + tcf_em_tree_destroy(tree); return err; } EXPORT_SYMBOL(tcf_em_tree_validate); @@ -393,7 +395,7 @@ EXPORT_SYMBOL(tcf_em_tree_validate); * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that * the ematch tree is not in use before calling this function. */ -void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) +void tcf_em_tree_destroy(struct tcf_ematch_tree *tree) { int i; @@ -405,7 +407,7 @@ void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) if (em->ops) { if (em->ops->destroy) - em->ops->destroy(tp, em); + em->ops->destroy(em); else if (!tcf_em_is_simple(em)) kfree((void *) em->data); module_put(em->ops->owner); -- cgit v1.2.3 From 94b2cfe02bfe3f1918d91bd6f498e308c5605cbc Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 6 Oct 2014 19:58:34 +0200 Subject: ipv6: minor fib6 cleanups like type safety, bool conversion, inline removal Also renamed struct fib6_walker_t to fib6_walker and enum fib_walk_state_t to fib6_walk_state as recommended by Cong Wang. Cc: Cong Wang Cc: YOSHIFUJI Hideaki Cc: Martin Lau Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 18 ++++++++++--- net/ipv6/ip6_fib.c | 71 ++++++++++++++++++++++----------------------------- 2 files changed, 45 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index cf485f9aa563..9221bf4c64f7 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -202,15 +202,25 @@ static inline void ip6_rt_put(struct rt6_info *rt) dst_release(&rt->dst); } -struct fib6_walker_t { +enum fib6_walk_state { +#ifdef CONFIG_IPV6_SUBTREES + FWS_S, +#endif + FWS_L, + FWS_R, + FWS_C, + FWS_U +}; + +struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; struct rt6_info *leaf; - unsigned char state; - unsigned char prune; + enum fib6_walk_state state; + bool prune; unsigned int skip; unsigned int count; - int (*func)(struct fib6_walker_t *); + int (*func)(struct fib6_walker *); void *args; }; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 97b9fa8de377..e8d7465b1597 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -46,18 +46,8 @@ static struct kmem_cache *fib6_node_kmem __read_mostly; -enum fib_walk_state_t { -#ifdef CONFIG_IPV6_SUBTREES - FWS_S, -#endif - FWS_L, - FWS_R, - FWS_C, - FWS_U -}; - -struct fib6_cleaner_t { - struct fib6_walker_t w; +struct fib6_cleaner { + struct fib6_walker w; struct net *net; int (*func)(struct rt6_info *, void *arg); void *arg; @@ -74,8 +64,8 @@ static DEFINE_RWLOCK(fib6_walker_lock); static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); -static int fib6_walk(struct fib6_walker_t *w); -static int fib6_walk_continue(struct fib6_walker_t *w); +static int fib6_walk(struct fib6_walker *w); +static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the @@ -91,20 +81,21 @@ static void fib6_gc_timer_cb(unsigned long arg); static LIST_HEAD(fib6_walkers); #define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) -static inline void fib6_walker_link(struct fib6_walker_t *w) +static void fib6_walker_link(struct fib6_walker *w) { write_lock_bh(&fib6_walker_lock); list_add(&w->lh, &fib6_walkers); write_unlock_bh(&fib6_walker_lock); } -static inline void fib6_walker_unlink(struct fib6_walker_t *w) +static void fib6_walker_unlink(struct fib6_walker *w) { write_lock_bh(&fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&fib6_walker_lock); } -static __inline__ u32 fib6_new_sernum(void) + +static u32 fib6_new_sernum(void) { u32 n = ++rt_sernum; if ((__s32)n <= 0) @@ -128,7 +119,7 @@ static __inline__ u32 fib6_new_sernum(void) # define BITOP_BE32_SWIZZLE 0 #endif -static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) +static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* @@ -142,7 +133,7 @@ static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -static __inline__ struct fib6_node *node_alloc(void) +static struct fib6_node *node_alloc(void) { struct fib6_node *fn; @@ -151,12 +142,12 @@ static __inline__ struct fib6_node *node_alloc(void) return fn; } -static __inline__ void node_free(struct fib6_node *fn) +static void node_free(struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); } -static __inline__ void rt6_release(struct rt6_info *rt) +static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) dst_free(&rt->dst); @@ -267,7 +258,7 @@ static void __net_init fib6_tables_init(struct net *net) #endif -static int fib6_dump_node(struct fib6_walker_t *w) +static int fib6_dump_node(struct fib6_walker *w) { int res; struct rt6_info *rt; @@ -287,7 +278,7 @@ static int fib6_dump_node(struct fib6_walker_t *w) static void fib6_dump_end(struct netlink_callback *cb) { - struct fib6_walker_t *w = (void *)cb->args[2]; + struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { @@ -310,7 +301,7 @@ static int fib6_dump_done(struct netlink_callback *cb) static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { - struct fib6_walker_t *w; + struct fib6_walker *w; int res; w = (void *)cb->args[2]; @@ -355,7 +346,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) unsigned int h, s_h; unsigned int e = 0, s_e; struct rt6_rtnl_dump_arg arg; - struct fib6_walker_t *w; + struct fib6_walker *w; struct fib6_table *tb; struct hlist_head *head; int res = 0; @@ -627,7 +618,7 @@ insert_above: return ln; } -static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt) +static bool rt6_qualify_for_ecmp(struct rt6_info *rt) { return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; @@ -820,7 +811,7 @@ add: return 0; } -static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt) +static void fib6_start_gc(struct net *net, struct rt6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) @@ -1174,7 +1165,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, int children; int nstate; struct fib6_node *child, *pn; - struct fib6_walker_t *w; + struct fib6_walker *w; int iter = 0; for (;;) { @@ -1276,7 +1267,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, struct nl_info *info) { - struct fib6_walker_t *w; + struct fib6_walker *w; struct rt6_info *rt = *rtp; struct net *net = info->nl_net; @@ -1414,7 +1405,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * <0 -> walk is terminated by an error. */ -static int fib6_walk_continue(struct fib6_walker_t *w) +static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn; @@ -1498,7 +1489,7 @@ skip: } } -static int fib6_walk(struct fib6_walker_t *w) +static int fib6_walk(struct fib6_walker *w) { int res; @@ -1512,11 +1503,11 @@ static int fib6_walk(struct fib6_walker_t *w) return res; } -static int fib6_clean_node(struct fib6_walker_t *w) +static int fib6_clean_node(struct fib6_walker *w) { int res; struct rt6_info *rt; - struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w); + struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, }; @@ -1554,9 +1545,9 @@ static int fib6_clean_node(struct fib6_walker_t *w) static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), - int prune, void *arg) + bool prune, void *arg) { - struct fib6_cleaner_t c; + struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; @@ -1583,7 +1574,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, 0, arg); + func, false, arg); write_unlock_bh(&table->tb6_lock); } } @@ -1602,7 +1593,7 @@ static int fib6_prune_clone(struct rt6_info *rt, void *arg) static void fib6_prune_clones(struct net *net, struct fib6_node *fn) { - fib6_clean_tree(net, fn, fib6_prune_clone, 1, NULL); + fib6_clean_tree(net, fn, fib6_prune_clone, true, NULL); } static int fib6_update_sernum(struct rt6_info *rt, void *arg) @@ -1828,7 +1819,7 @@ void fib6_gc_cleanup(void) struct ipv6_route_iter { struct seq_net_private p; - struct fib6_walker_t w; + struct fib6_walker w; loff_t skip; struct fib6_table *tbl; __u32 sernum; @@ -1859,7 +1850,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) return 0; } -static int ipv6_route_yield(struct fib6_walker_t *w) +static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; @@ -1980,7 +1971,7 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { - struct fib6_walker_t *w = &iter->w; + struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } -- cgit v1.2.3 From 42b18706469a02c1f84375ac0ee2f30f28d85d4c Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 6 Oct 2014 19:58:35 +0200 Subject: ipv6: make rt_sernum atomic and serial number fields ordinary ints Cc: YOSHIFUJI Hideaki Cc: Martin Lau Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 +- net/ipv6/ip6_fib.c | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 9221bf4c64f7..8eea35d32a75 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -64,7 +64,7 @@ struct fib6_node { __u16 fn_bit; /* bit key */ __u16 fn_flags; - __u32 fn_sernum; + int fn_sernum; struct rt6_info *rr_ptr; }; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e8d7465b1597..332f1e0ff8e2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -74,7 +74,7 @@ static int fib6_walk_continue(struct fib6_walker *w); * result of redirects, path MTU changes, etc. */ -static __u32 rt_sernum; +static atomic_t rt_sernum = ATOMIC_INIT(1); static void fib6_gc_timer_cb(unsigned long arg); @@ -95,12 +95,15 @@ static void fib6_walker_unlink(struct fib6_walker *w) write_unlock_bh(&fib6_walker_lock); } -static u32 fib6_new_sernum(void) +static int fib6_new_sernum(void) { - u32 n = ++rt_sernum; - if ((__s32)n <= 0) - rt_sernum = n = 1; - return n; + int new, old; + + do { + old = atomic_read(&rt_sernum); + new = old < INT_MAX ? old + 1 : 1; + } while (atomic_cmpxchg(&rt_sernum, old, new) != old); + return new; } /* @@ -421,7 +424,7 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, struct rt6key *key; int bit; __be32 dir = 0; - __u32 sernum = fib6_new_sernum(); + int sernum = fib6_new_sernum(); RT6_TRACE("fib6_add_1\n"); @@ -1598,7 +1601,7 @@ static void fib6_prune_clones(struct net *net, struct fib6_node *fn) static int fib6_update_sernum(struct rt6_info *rt, void *arg) { - __u32 sernum = *(__u32 *)arg; + int sernum = *(int *)arg; if (rt->rt6i_node && rt->rt6i_node->fn_sernum != sernum) @@ -1609,7 +1612,7 @@ static int fib6_update_sernum(struct rt6_info *rt, void *arg) static void fib6_flush_trees(struct net *net) { - __u32 new_sernum = fib6_new_sernum(); + int new_sernum = fib6_new_sernum(); fib6_clean_all(net, fib6_update_sernum, &new_sernum); } @@ -1822,7 +1825,7 @@ struct ipv6_route_iter { struct fib6_walker w; loff_t skip; struct fib6_table *tbl; - __u32 sernum; + int sernum; }; static int ipv6_route_seq_show(struct seq_file *seq, void *v) -- cgit v1.2.3 From 812918c464eca0e8c145f975932ca5020e9c05cb Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 6 Oct 2014 19:58:37 +0200 Subject: ipv6: make fib6 serial number per namespace Try to reduce number of possible fn_sernum mutation by constraining them to their namespace. Also remove rt_genid which I forgot to remove in 705f1c869d577c ("ipv6: remove rt6i_genid"). Cc: YOSHIFUJI Hideaki Cc: Martin Lau Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipv6/ip6_fib.c | 13 ++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index eade27adecf3..69ae41f2098c 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -76,7 +76,7 @@ struct netns_ipv6 { #endif #endif atomic_t dev_addr_genid; - atomic_t rt_genid; + atomic_t fib6_sernum; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 34f726f59814..e8c4400f23e9 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -766,7 +766,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = 0; - atomic_set(&net->ipv6.rt_genid, 0); + atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); if (err) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index be9cb09b05f7..6f9beb1f2861 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -74,8 +74,6 @@ static int fib6_walk_continue(struct fib6_walker *w); * result of redirects, path MTU changes, etc. */ -static atomic_t rt_sernum = ATOMIC_INIT(1); - static void fib6_gc_timer_cb(unsigned long arg); static LIST_HEAD(fib6_walkers); @@ -95,14 +93,15 @@ static void fib6_walker_unlink(struct fib6_walker *w) write_unlock_bh(&fib6_walker_lock); } -static int fib6_new_sernum(void) +static int fib6_new_sernum(struct net *net) { int new, old; do { - old = atomic_read(&rt_sernum); + old = atomic_read(&net->ipv6.fib6_sernum); new = old < INT_MAX ? old + 1 : 1; - } while (atomic_cmpxchg(&rt_sernum, old, new) != old); + } while (atomic_cmpxchg(&net->ipv6.fib6_sernum, + old, new) != old); return new; } @@ -841,7 +840,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, int err = -ENOMEM; int allow_create = 1; int replace_required = 0; - int sernum = fib6_new_sernum(); + int sernum = fib6_new_sernum(info->nl_net); if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1612,7 +1611,7 @@ static int fib6_update_sernum(struct rt6_info *rt, void *arg) static void fib6_flush_trees(struct net *net) { - int new_sernum = fib6_new_sernum(); + int new_sernum = fib6_new_sernum(net); fib6_clean_all(net, fib6_update_sernum, &new_sernum); } -- cgit v1.2.3 From 7c5df8fa1921450d2210db9928f43cf4f414982c Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Mon, 6 Oct 2014 15:15:14 -0700 Subject: openvswitch: fix a compilation error when CONFIG_INET is not setW! Fix a openvswitch compilation error when CONFIG_INET is not set: ===================================================== In file included from include/net/geneve.h:4:0, from net/openvswitch/flow_netlink.c:45: include/net/udp_tunnel.h: In function 'udp_tunnel_handle_offloads': >> include/net/udp_tunnel.h:100:2: error: implicit declaration of function 'iptunnel_handle_offloads' [-Werror=implicit-function-declaration] >> return iptunnel_handle_offloads(skb, udp_csum, type); >> ^ >> >> include/net/udp_tunnel.h:100:2: warning: return makes pointer from integer without a cast >> >> cc1: some warnings being treated as errors ===================================================== Reported-by: kbuild test robot Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- drivers/net/Kconfig | 1 - include/net/geneve.h | 36 +++++++++++++++++++++--------------- net/ipv4/Kconfig | 29 +++++++++++++++-------------- 3 files changed, 36 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index c6f6f69f8961..4706386b7d34 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -147,7 +147,6 @@ config MACVTAP config VXLAN tristate "Virtual eXtensible Local Area Network (VXLAN)" depends on INET - select NET_IP_TUNNEL select NET_UDP_TUNNEL ---help--- This allows one to create vxlan virtual interfaces that provide diff --git a/include/net/geneve.h b/include/net/geneve.h index ce98865318bf..112132cf8e2e 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -1,22 +1,10 @@ #ifndef __NET_GENEVE_H #define __NET_GENEVE_H 1 +#ifdef CONFIG_INET #include +#endif -struct geneve_sock; - -typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb); - -struct geneve_sock { - struct hlist_node hlist; - geneve_rcv_t *rcv; - void *rcv_data; - struct work_struct del_work; - struct socket *sock; - struct rcu_head rcu; - atomic_t refcnt; - struct udp_offload udp_offloads; -}; /* Geneve Header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ @@ -74,6 +62,22 @@ struct genevehdr { struct geneve_opt options[]; }; +#ifdef CONFIG_INET +struct geneve_sock; + +typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb); + +struct geneve_sock { + struct hlist_node hlist; + geneve_rcv_t *rcv; + void *rcv_data; + struct work_struct del_work; + struct socket *sock; + struct rcu_head rcu; + atomic_t refcnt; + struct udp_offload udp_offloads; +}; + #define GENEVE_VER 0 #define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) @@ -88,4 +92,6 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, bool xnet); -#endif +#endif /*ifdef CONFIG_INET */ + +#endif /*ifdef__NET_GENEVE_H */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index c2035447e649..e682b48e0709 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -309,6 +309,7 @@ config NET_IPVTI config NET_UDP_TUNNEL tristate + select NET_IP_TUNNEL default n config NET_FOU @@ -321,6 +322,20 @@ config NET_FOU network mechanisms and optimizations for UDP (such as ECMP and RSS) can be leveraged to provide better service. +config GENEVE + tristate "Generic Network Virtualization Encapsulation (Geneve)" + depends on INET + select NET_UDP_TUNNEL + ---help--- + This allows one to create Geneve virtual interfaces that provide + Layer 2 Networks over Layer 3 Networks. Geneve is often used + to tunnel virtual network infrastructure in virtualized environments. + For more information see: + http://tools.ietf.org/html/draft-gross-geneve-01 + + To compile this driver as a module, choose M here: the module + + config INET_AH tristate "IP: AH transformation" select XFRM_ALGO @@ -453,20 +468,6 @@ config TCP_CONG_BIC increase provides TCP friendliness. See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ -config GENEVE - tristate "Generic Network Virtualization Encapsulation (Geneve)" - depends on INET - select NET_IP_TUNNEL - select NET_UDP_TUNNEL - ---help--- - This allows one to create Geneve virtual interfaces that provide - Layer 2 Networks over Layer 3 Networks. Geneve is often used - to tunnel virtual network infrastructure in virtualized environments. - For more information see: - http://tools.ietf.org/html/draft-gross-geneve-01 - - To compile this driver as a module, choose M here: the module - config TCP_CONG_CUBIC tristate "CUBIC TCP" default y -- cgit v1.2.3 From fd2ef0ba3071c92ac6272ab22ea3f2b16d88a4eb Mon Sep 17 00:00:00 2001 From: Petri Gynther Date: Mon, 6 Oct 2014 11:38:30 -0700 Subject: net: phy: adjust fixed_phy_register() return value Adjust fixed_phy_register() to return struct phy_device *, so that it becomes easy to use fixed PHYs without device tree support: phydev = fixed_phy_register(PHY_POLL, &fixed_phy_status, NULL); fixed_phy_set_link_update(phydev, fixed_phy_link_update); phy_connect_direct(netdev, phydev, handler_fn, phy_interface); This change is a prerequisite for modifying bcmgenet driver to work without a device tree on Broadcom's MIPS-based 7xxx platforms. Signed-off-by: Petri Gynther Signed-off-by: David S. Miller --- drivers/net/phy/fixed.c | 16 ++++++++-------- drivers/of/of_mdio.c | 7 +++++-- include/linux/phy_fixed.h | 14 +++++++------- 3 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/fixed.c b/drivers/net/phy/fixed.c index 5b19fbbda6d4..47872caa0081 100644 --- a/drivers/net/phy/fixed.c +++ b/drivers/net/phy/fixed.c @@ -233,9 +233,9 @@ EXPORT_SYMBOL_GPL(fixed_phy_del); static int phy_fixed_addr; static DEFINE_SPINLOCK(phy_fixed_addr_lock); -int fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np) +struct phy_device *fixed_phy_register(unsigned int irq, + struct fixed_phy_status *status, + struct device_node *np) { struct fixed_mdio_bus *fmb = &platform_fmb; struct phy_device *phy; @@ -246,19 +246,19 @@ int fixed_phy_register(unsigned int irq, spin_lock(&phy_fixed_addr_lock); if (phy_fixed_addr == PHY_MAX_ADDR) { spin_unlock(&phy_fixed_addr_lock); - return -ENOSPC; + return ERR_PTR(-ENOSPC); } phy_addr = phy_fixed_addr++; spin_unlock(&phy_fixed_addr_lock); ret = fixed_phy_add(PHY_POLL, phy_addr, status); if (ret < 0) - return ret; + return ERR_PTR(ret); phy = get_phy_device(fmb->mii_bus, phy_addr, false); if (!phy || IS_ERR(phy)) { fixed_phy_del(phy_addr); - return -EINVAL; + return ERR_PTR(-EINVAL); } of_node_get(np); @@ -269,10 +269,10 @@ int fixed_phy_register(unsigned int irq, phy_device_free(phy); of_node_put(np); fixed_phy_del(phy_addr); - return ret; + return ERR_PTR(ret); } - return 0; + return phy; } static int __init fixed_mdio_bus_init(void) diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index a85d80012993..1bd43053b8c7 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -286,6 +286,7 @@ int of_phy_register_fixed_link(struct device_node *np) struct device_node *fixed_link_node; const __be32 *fixed_link_prop; int len; + struct phy_device *phy; /* New binding */ fixed_link_node = of_get_child_by_name(np, "fixed-link"); @@ -299,7 +300,8 @@ int of_phy_register_fixed_link(struct device_node *np) status.asym_pause = of_property_read_bool(fixed_link_node, "asym-pause"); of_node_put(fixed_link_node); - return fixed_phy_register(PHY_POLL, &status, np); + phy = fixed_phy_register(PHY_POLL, &status, np); + return IS_ERR(phy) ? PTR_ERR(phy) : 0; } /* Old binding */ @@ -310,7 +312,8 @@ int of_phy_register_fixed_link(struct device_node *np) status.speed = be32_to_cpu(fixed_link_prop[2]); status.pause = be32_to_cpu(fixed_link_prop[3]); status.asym_pause = be32_to_cpu(fixed_link_prop[4]); - return fixed_phy_register(PHY_POLL, &status, np); + phy = fixed_phy_register(PHY_POLL, &status, np); + return IS_ERR(phy) ? PTR_ERR(phy) : 0; } return -ENODEV; diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 941138664c1d..f2ca1b459377 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -14,9 +14,9 @@ struct device_node; #ifdef CONFIG_FIXED_PHY extern int fixed_phy_add(unsigned int irq, int phy_id, struct fixed_phy_status *status); -extern int fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np); +extern struct phy_device *fixed_phy_register(unsigned int irq, + struct fixed_phy_status *status, + struct device_node *np); extern void fixed_phy_del(int phy_addr); extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, @@ -27,11 +27,11 @@ static inline int fixed_phy_add(unsigned int irq, int phy_id, { return -ENODEV; } -static inline int fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np) +static inline struct phy_device *fixed_phy_register(unsigned int irq, + struct fixed_phy_status *status, + struct device_node *np) { - return -ENODEV; + return ERR_PTR(-ENODEV); } static inline int fixed_phy_del(int phy_addr) { -- cgit v1.2.3 From 0287587884b15041203b3a362d485e1ab1f24445 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 18:38:35 -0700 Subject: net: better IFF_XMIT_DST_RELEASE support Testing xmit_more support with netperf and connected UDP sockets, I found strange dst refcount false sharing. Current handling of IFF_XMIT_DST_RELEASE is not optimal. Dropping dst in validate_xmit_skb() is certainly too late in case packet was queued by cpu X but dequeued by cpu Y The logical point to take care of drop/force is in __dev_queue_xmit() before even taking qdisc lock. As Julian Anastasov pointed out, need for skb_dst() might come from some packet schedulers or classifiers. This patch adds new helper to cleanly express needs of various drivers or qdiscs/classifiers. Drivers that need skb_dst() in their ndo_start_xmit() should call following helper in their setup instead of the prior : dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; -> netif_keep_dst(dev); Instead of using a single bit, we use two bits, one being eventually rebuilt in bonding/team drivers. The other one, is permanent and blocks IFF_XMIT_DST_RELEASE being rebuilt in bonding/team. Eventually, we could add something smarter later. Signed-off-by: Eric Dumazet Cc: Julian Anastasov Signed-off-by: David S. Miller --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- drivers/net/appletalk/ipddp.c | 2 +- drivers/net/bonding/bond_main.c | 9 ++++++--- drivers/net/eql.c | 2 +- drivers/net/ifb.c | 3 ++- drivers/net/loopback.c | 2 +- drivers/net/macvlan.c | 3 ++- drivers/net/ppp/ppp_generic.c | 2 +- drivers/net/team/team.c | 8 +++++--- drivers/net/vxlan.c | 2 +- drivers/net/wan/hdlc_fr.c | 2 +- drivers/s390/net/qeth_l3_main.c | 2 +- include/linux/netdevice.h | 8 ++++++++ net/8021q/vlan_dev.c | 3 ++- net/atm/clip.c | 2 +- net/core/dev.c | 19 +++++++++---------- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_vti.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv6/ip6_gre.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6_vti.c | 2 +- net/ipv6/sit.c | 2 +- net/sched/cls_flow.c | 2 ++ net/sched/cls_route.c | 1 + net/sched/sch_generic.c | 3 --- net/sched/sch_teql.c | 2 +- 27 files changed, 54 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 13e6e0431592..58b5aa3b6f2d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1364,7 +1364,7 @@ void ipoib_setup(struct net_device *dev) dev->tx_queue_len = ipoib_sendq_size * 2; dev->features = (NETIF_F_VLAN_CHALLENGED | NETIF_F_HIGHDMA); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index 10d0dba572c2..e90c6a7333d7 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -74,7 +74,7 @@ static struct net_device * __init ipddp_init(void) if (!dev) return ERR_PTR(-ENOMEM); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); strcpy(dev->name, "ipddp%d"); if (version_printed++ == 0) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 3ad5413d4f57..c9ac06cfe6b7 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1002,7 +1002,8 @@ static netdev_features_t bond_fix_features(struct net_device *dev, static void bond_compute_features(struct bonding *bond) { - unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE; + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | + IFF_XMIT_DST_RELEASE_PERM; netdev_features_t vlan_features = BOND_VLAN_FEATURES; netdev_features_t enc_features = BOND_ENC_FEATURES; struct net_device *bond_dev = bond->dev; @@ -1038,8 +1039,10 @@ done: bond_dev->gso_max_segs = gso_max_segs; netif_set_gso_max_size(bond_dev, gso_max_size); - flags = bond_dev->priv_flags & ~IFF_XMIT_DST_RELEASE; - bond_dev->priv_flags = flags | dst_release_flag; + bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && + dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(bond_dev); } diff --git a/drivers/net/eql.c b/drivers/net/eql.c index 957e5c0cede3..a10ad74cc8d2 100644 --- a/drivers/net/eql.c +++ b/drivers/net/eql.c @@ -199,7 +199,7 @@ static void __init eql_setup(struct net_device *dev) dev->type = ARPHRD_SLIP; dev->tx_queue_len = 5; /* Hands them off fast */ - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int eql_open(struct net_device *dev) diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index d2d4a3d2237f..34f846b4bd05 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -185,7 +185,8 @@ static void ifb_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->flags &= ~IFF_MULTICAST; - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); eth_hw_addr_random(dev); } diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 8f2262540561..c76283c2f84a 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -169,7 +169,7 @@ static void loopback_setup(struct net_device *dev) dev->type = ARPHRD_LOOPBACK; /* 0x0001*/ dev->flags = IFF_LOOPBACK; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->hw_features = NETIF_F_ALL_TSO | NETIF_F_UFO; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index e8a453f1b458..38b4fae61f04 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1025,7 +1025,8 @@ void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); dev->priv_flags |= IFF_UNICAST_FLT; dev->netdev_ops = &macvlan_netdev_ops; dev->destructor = free_netdev; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index fa0d71727894..80e6f3430f65 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1103,7 +1103,7 @@ static void ppp_setup(struct net_device *dev) dev->type = ARPHRD_PPP; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->features |= NETIF_F_NETNS_LOCAL; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } /* diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 2277c3679a51..a94a9df3e6bd 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -970,7 +970,8 @@ static void __team_compute_features(struct team *team) struct team_port *port; u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL; unsigned short max_hard_header_len = ETH_HLEN; - unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE; + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | + IFF_XMIT_DST_RELEASE_PERM; list_for_each_entry(port, &team->port_list, list) { vlan_features = netdev_increment_features(vlan_features, @@ -985,8 +986,9 @@ static void __team_compute_features(struct team *team) team->dev->vlan_features = vlan_features; team->dev->hard_header_len = max_hard_header_len; - flags = team->dev->priv_flags & ~IFF_XMIT_DST_RELEASE; - team->dev->priv_flags = flags | dst_release_flag; + team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if (dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + team->dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(team->dev); } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 2af795d6ba05..2a51e6e48e1e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2193,7 +2193,7 @@ static void vxlan_setup(struct net_device *dev) dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; INIT_LIST_HEAD(&vxlan->next); diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index e5c7e6165a4b..3ebed1c40abb 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -1047,7 +1047,7 @@ static void pvc_setup(struct net_device *dev) dev->flags = IFF_POINTOPOINT; dev->hard_header_len = 10; dev->addr_len = 2; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static const struct net_device_ops pvc_ops = { diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index f8427a2c4840..afebb9709763 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3306,7 +3306,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card) card->dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER; - card->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(card->dev); card->dev->gso_max_size = 15 * PAGE_SIZE; SET_NETDEV_DEV(card->dev, &card->gdev->dev); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2df86f50261c..3a4315b39d20 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1206,6 +1206,7 @@ enum netdev_priv_flags { IFF_SUPP_NOFCS = 1<<19, IFF_LIVE_ADDR_CHANGE = 1<<20, IFF_MACVLAN = 1<<21, + IFF_XMIT_DST_RELEASE_PERM = 1<<22, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1230,6 +1231,7 @@ enum netdev_priv_flags { #define IFF_SUPP_NOFCS IFF_SUPP_NOFCS #define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE #define IFF_MACVLAN IFF_MACVLAN +#define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM /** * struct net_device - The DEVICE structure. @@ -3588,6 +3590,12 @@ static inline bool netif_supports_nofcs(struct net_device *dev) return dev->priv_flags & IFF_SUPP_NOFCS; } +/* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ +static inline void netif_keep_dst(struct net_device *dev) +{ + dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM); +} + extern struct pernet_operations __net_initdata loopback_net_ops; /* Logging, debugging and troubleshooting/diagnostic helpers. */ diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 35a6b6b15e8a..0d441ec8763e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -799,7 +799,8 @@ void vlan_setup(struct net_device *dev) ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN; - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; diff --git a/net/atm/clip.c b/net/atm/clip.c index 1d9eaa4f041a..17e55dfecbe2 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -501,7 +501,7 @@ static void clip_setup(struct net_device *dev) /* without any more elaborate queuing. 100 is a reasonable */ /* compromise between decent burst-tolerance and protection */ /* against memory hogs. */ - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int clip_create(int number) diff --git a/net/core/dev.c b/net/core/dev.c index a63b8c43c1b6..3c5bdaa44486 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2665,12 +2665,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (skb->next) return skb; - /* If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); - features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) @@ -2811,8 +2805,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) - skb_dst_force(skb); qdisc_bstats_update(q, skb); @@ -2827,7 +2819,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { - skb_dst_force(skb); rc = q->enqueue(skb, q) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { @@ -2924,6 +2915,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_update_prio(skb); + /* If device/qdisc don't need skb->dst, release it right now while + * its hot in this cpu cache. + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + else + skb_dst_force(skb); + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -6674,7 +6673,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->all_adj_list.upper); INIT_LIST_HEAD(&dev->all_adj_list.lower); - dev->priv_flags = IFF_XMIT_DST_RELEASE; + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); dev->num_tx_queues = txqs; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0485ef18d254..12055fdbe716 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -510,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->addr_len = 4; if (iph->daddr) { diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index e453cb724a95..3e861011e4a3 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ea88ab3102a8..37096d64730e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -289,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 74b677916a70..de3b1c86b8d3 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1242,7 +1242,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->iflink = 0; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int ip6gre_tunnel_init(struct net_device *dev) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index d3e8888ad611..9409887fb664 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1493,7 +1493,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->mtu -= 8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5833a2244467..d440bb585524 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -807,7 +807,7 @@ static void vti6_dev_setup(struct net_device *dev) dev->mtu = ETH_DATA_LEN; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } /** diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 0d4e27466f82..6eab37cf5345 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1364,7 +1364,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index a5d2b20db560..4ac515f2a6ce 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -493,6 +493,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, tcf_exts_change(tp, &fnew->exts, &e); tcf_em_tree_change(tp, &fnew->ematches, &t); + netif_keep_dst(qdisc_dev(tp->q)); + if (tb[TCA_FLOW_KEYS]) { fnew->keymask = keymask; fnew->nkeys = nkeys; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 6f22baae0afa..109a329b7198 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -524,6 +524,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (f->handle < f1->handle) break; + netif_keep_dst(qdisc_dev(tp->q)); rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2b349a4de3c8..38d58e6cef07 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -47,7 +47,6 @@ EXPORT_SYMBOL(default_qdisc_ops); static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - skb_dst_force(skb); q->gso_skb = skb; q->qstats.requeues++; q->q.qlen++; /* it's still part of the queue */ @@ -218,8 +217,6 @@ static inline int qdisc_restart(struct Qdisc *q) if (unlikely(!skb)) return 0; - WARN_ON_ONCE(skb_dst_is_noref(skb)); - root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 5cd291bd00e4..6ada42396a24 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -470,7 +470,7 @@ static __init void teql_master_setup(struct net_device *dev) dev->tx_queue_len = 100; dev->flags = IFF_NOARP; dev->hard_header_len = LL_MAX_HEADER; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static LIST_HEAD(master_dev_list); -- cgit v1.2.3 From 583d4a6885cfa75a3d189f6bb69b5c545e961c75 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Tue, 7 Oct 2014 15:04:57 +0200 Subject: net: fs_enet: Remove non NAPI RX In the probe function, use_napi is inconditionnaly set to 1. This patch removes all the code which is conditional to !use_napi, and removes use_napi which has then become useless. Signed-off-by: Christophe Leroy Signed-off-by: David S. Miller --- .../net/ethernet/freescale/fs_enet/fs_enet-main.c | 164 ++------------------- include/linux/fs_enet_pd.h | 1 - 2 files changed, 15 insertions(+), 150 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index 748fd24d3d9e..71a25b4e2d5a 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -215,128 +215,6 @@ static int fs_enet_rx_napi(struct napi_struct *napi, int budget) return received; } -/* non NAPI receive function */ -static int fs_enet_rx_non_napi(struct net_device *dev) -{ - struct fs_enet_private *fep = netdev_priv(dev); - const struct fs_platform_info *fpi = fep->fpi; - cbd_t __iomem *bdp; - struct sk_buff *skb, *skbn, *skbt; - int received = 0; - u16 pkt_len, sc; - int curidx; - /* - * First, grab all of the stats for the incoming packet. - * These get messed up if we get called due to a busy condition. - */ - bdp = fep->cur_rx; - - while (((sc = CBDR_SC(bdp)) & BD_ENET_RX_EMPTY) == 0) { - - curidx = bdp - fep->rx_bd_base; - - /* - * Since we have allocated space to hold a complete frame, - * the last indicator should be set. - */ - if ((sc & BD_ENET_RX_LAST) == 0) - dev_warn(fep->dev, "rcv is not +last\n"); - - /* - * Check for errors. - */ - if (sc & (BD_ENET_RX_LG | BD_ENET_RX_SH | BD_ENET_RX_CL | - BD_ENET_RX_NO | BD_ENET_RX_CR | BD_ENET_RX_OV)) { - fep->stats.rx_errors++; - /* Frame too long or too short. */ - if (sc & (BD_ENET_RX_LG | BD_ENET_RX_SH)) - fep->stats.rx_length_errors++; - /* Frame alignment */ - if (sc & (BD_ENET_RX_NO | BD_ENET_RX_CL)) - fep->stats.rx_frame_errors++; - /* CRC Error */ - if (sc & BD_ENET_RX_CR) - fep->stats.rx_crc_errors++; - /* FIFO overrun */ - if (sc & BD_ENET_RX_OV) - fep->stats.rx_crc_errors++; - - skb = fep->rx_skbuff[curidx]; - - dma_unmap_single(fep->dev, CBDR_BUFADDR(bdp), - L1_CACHE_ALIGN(PKT_MAXBUF_SIZE), - DMA_FROM_DEVICE); - - skbn = skb; - - } else { - - skb = fep->rx_skbuff[curidx]; - - dma_unmap_single(fep->dev, CBDR_BUFADDR(bdp), - L1_CACHE_ALIGN(PKT_MAXBUF_SIZE), - DMA_FROM_DEVICE); - - /* - * Process the incoming frame. - */ - fep->stats.rx_packets++; - pkt_len = CBDR_DATLEN(bdp) - 4; /* remove CRC */ - fep->stats.rx_bytes += pkt_len + 4; - - if (pkt_len <= fpi->rx_copybreak) { - /* +2 to make IP header L1 cache aligned */ - skbn = netdev_alloc_skb(dev, pkt_len + 2); - if (skbn != NULL) { - skb_reserve(skbn, 2); /* align IP header */ - skb_copy_from_linear_data(skb, - skbn->data, pkt_len); - /* swap */ - skbt = skb; - skb = skbn; - skbn = skbt; - } - } else { - skbn = netdev_alloc_skb(dev, ENET_RX_FRSIZE); - - if (skbn) - skb_align(skbn, ENET_RX_ALIGN); - } - - if (skbn != NULL) { - skb_put(skb, pkt_len); /* Make room */ - skb->protocol = eth_type_trans(skb, dev); - received++; - netif_rx(skb); - } else { - fep->stats.rx_dropped++; - skbn = skb; - } - } - - fep->rx_skbuff[curidx] = skbn; - CBDW_BUFADDR(bdp, dma_map_single(fep->dev, skbn->data, - L1_CACHE_ALIGN(PKT_MAXBUF_SIZE), - DMA_FROM_DEVICE)); - CBDW_DATLEN(bdp, 0); - CBDW_SC(bdp, (sc & ~BD_ENET_RX_STATS) | BD_ENET_RX_EMPTY); - - /* - * Update BD pointer to next entry. - */ - if ((sc & BD_ENET_RX_WRAP) == 0) - bdp++; - else - bdp = fep->rx_bd_base; - - (*fep->ops->rx_bd_done)(dev); - } - - fep->cur_rx = bdp; - - return 0; -} - static void fs_enet_tx(struct net_device *dev) { struct fs_enet_private *fep = netdev_priv(dev); @@ -453,8 +331,7 @@ fs_enet_interrupt(int irq, void *dev_id) nr++; int_clr_events = int_events; - if (fpi->use_napi) - int_clr_events &= ~fep->ev_napi_rx; + int_clr_events &= ~fep->ev_napi_rx; (*fep->ops->clear_int_events)(dev, int_clr_events); @@ -462,19 +339,15 @@ fs_enet_interrupt(int irq, void *dev_id) (*fep->ops->ev_error)(dev, int_events); if (int_events & fep->ev_rx) { - if (!fpi->use_napi) - fs_enet_rx_non_napi(dev); - else { - napi_ok = napi_schedule_prep(&fep->napi); - - (*fep->ops->napi_disable_rx)(dev); - (*fep->ops->clear_int_events)(dev, fep->ev_napi_rx); - - /* NOTE: it is possible for FCCs in NAPI mode */ - /* to submit a spurious interrupt while in poll */ - if (napi_ok) - __napi_schedule(&fep->napi); - } + napi_ok = napi_schedule_prep(&fep->napi); + + (*fep->ops->napi_disable_rx)(dev); + (*fep->ops->clear_int_events)(dev, fep->ev_napi_rx); + + /* NOTE: it is possible for FCCs in NAPI mode */ + /* to submit a spurious interrupt while in poll */ + if (napi_ok) + __napi_schedule(&fep->napi); } if (int_events & fep->ev_tx) @@ -811,24 +684,21 @@ static int fs_enet_open(struct net_device *dev) /* not doing this, will cause a crash in fs_enet_rx_napi */ fs_init_bds(fep->ndev); - if (fep->fpi->use_napi) - napi_enable(&fep->napi); + napi_enable(&fep->napi); /* Install our interrupt handler. */ r = request_irq(fep->interrupt, fs_enet_interrupt, IRQF_SHARED, "fs_enet-mac", dev); if (r != 0) { dev_err(fep->dev, "Could not allocate FS_ENET IRQ!"); - if (fep->fpi->use_napi) - napi_disable(&fep->napi); + napi_disable(&fep->napi); return -EINVAL; } err = fs_init_phy(dev); if (err) { free_irq(fep->interrupt, dev); - if (fep->fpi->use_napi) - napi_disable(&fep->napi); + napi_disable(&fep->napi); return err; } phy_start(fep->phydev); @@ -845,8 +715,7 @@ static int fs_enet_close(struct net_device *dev) netif_stop_queue(dev); netif_carrier_off(dev); - if (fep->fpi->use_napi) - napi_disable(&fep->napi); + napi_disable(&fep->napi); phy_stop(fep->phydev); spin_lock_irqsave(&fep->lock, flags); @@ -1022,7 +891,6 @@ static int fs_enet_probe(struct platform_device *ofdev) fpi->rx_ring = 32; fpi->tx_ring = 32; fpi->rx_copybreak = 240; - fpi->use_napi = 1; fpi->napi_weight = 17; fpi->phy_node = of_parse_phandle(ofdev->dev.of_node, "phy-handle", 0); if (!fpi->phy_node && of_phy_is_fixed_link(ofdev->dev.of_node)) { @@ -1102,9 +970,7 @@ static int fs_enet_probe(struct platform_device *ofdev) ndev->netdev_ops = &fs_enet_netdev_ops; ndev->watchdog_timeo = 2 * HZ; - if (fpi->use_napi) - netif_napi_add(ndev, &fep->napi, fs_enet_rx_napi, - fpi->napi_weight); + netif_napi_add(ndev, &fep->napi, fs_enet_rx_napi, fpi->napi_weight); ndev->ethtool_ops = &fs_ethtool_ops; diff --git a/include/linux/fs_enet_pd.h b/include/linux/fs_enet_pd.h index efb05961bdd8..77d783f71527 100644 --- a/include/linux/fs_enet_pd.h +++ b/include/linux/fs_enet_pd.h @@ -139,7 +139,6 @@ struct fs_platform_info { int rx_ring, tx_ring; /* number of buffers on rx */ __u8 macaddr[ETH_ALEN]; /* mac address */ int rx_copybreak; /* limit we copy small frames */ - int use_napi; /* use NAPI */ int napi_weight; /* NAPI weight */ int use_rmii; /* use RMII mode */ -- cgit v1.2.3 From 709c48b39ecf11a81f3820c13a828c330fd832b9 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 8 Oct 2014 23:53:39 +0900 Subject: net: description of dma_cookie cause make xmldocs warning In commit 7bced397510ab569d31de4c70b39e13355046387, dma_cookie was removed from struct skbuff. But the description of dma_cookie still exist. So the "make xmldocs" output following warning. Warning(.//include/linux/skbuff.h:609): Excess struct/union /enum/typedef member 'dma_cookie' description in 'sk_buff' Remove description of dma_cookie fix the symptom. Signed-off-by: Masanari Iida Acked-by: Dan Williams Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3a5ec7638627..776104ba02ce 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -483,8 +483,6 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS - * @dma_cookie: a cookie to one of several possible DMA operations - * done by skb DMA functions * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark -- cgit v1.2.3 From 535114539bb2c081b6680cb5a34be17e7b45df37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Oct 2014 08:19:27 -0700 Subject: net: add netdev_txq_bql_{enqueue, complete}_prefetchw() helpers Add two helpers so that drivers do not have to care of BQL being available or not. Signed-off-by: Eric Dumazet Reported-by: Jim Davis Fixes: 29d40c903247 ("net/mlx4_en: Use prefetch in tx path") Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 5 +++-- include/linux/netdevice.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 8726a4aee5a7..34c137878545 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -392,7 +392,8 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev, if (!priv->port_up) return true; - prefetchw(&ring->tx_queue->dql.limit); + netdev_txq_bql_complete_prefetchw(ring->tx_queue); + index = cons_index & size_mask; cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor; last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb); @@ -737,7 +738,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) vlan_tag = vlan_tx_tag_get(skb); - prefetchw(&ring->tx_queue->dql); + netdev_txq_bql_enqueue_prefetchw(ring->tx_queue); /* Track current inflight packets for performance analysis */ AVG_PERF_COUNTER(priv->pstats.inflight_avg, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a4315b39d20..838407aea705 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -2480,6 +2481,34 @@ netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue) return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN; } +/** + * netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write + * @dev_queue: pointer to transmit queue + * + * BQL enabled drivers might use this helper in their ndo_start_xmit(), + * to give appropriate hint to the cpu. + */ +static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue) +{ +#ifdef CONFIG_BQL + prefetchw(&dev_queue->dql.num_queued); +#endif +} + +/** + * netdev_txq_bql_complete_prefetchw - prefetch bql data for write + * @dev_queue: pointer to transmit queue + * + * BQL enabled drivers might use this helper in their TX completion path, + * to give appropriate hint to the cpu. + */ +static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue) +{ +#ifdef CONFIG_BQL + prefetchw(&dev_queue->dql.limit); +#endif +} + static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes) { -- cgit v1.2.3