From 84c61d92bb6e9048eecc0738a83f1bf66f053026 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 1 Aug 2014 11:13:30 +0300 Subject: Bluetooth: Add convenience function to check for pending power off There are several situations where we're interested in knowing whether we're currently in the process of powering off an adapter. This patch adds a convenience function for the purpose and makes it public since we'll soon need to access it from hci_event.c as well. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 52 ++++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index b8554d429d88..c9de0d9945f5 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6281,25 +6281,35 @@ static void unpair_device_rsp(struct pending_cmd *cmd, void *data) mgmt_pending_remove(cmd); } +bool mgmt_powering_down(struct hci_dev *hdev) +{ + struct pending_cmd *cmd; + struct mgmt_mode *cp; + + cmd = mgmt_pending_find(MGMT_OP_SET_POWERED, hdev); + if (!cmd) + return false; + + cp = cmd->param; + if (!cp->val) + return true; + + return false; +} + void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 reason, bool mgmt_connected) { struct mgmt_ev_device_disconnected ev; - struct pending_cmd *power_off; struct sock *sk = NULL; - power_off = mgmt_pending_find(MGMT_OP_SET_POWERED, hdev); - if (power_off) { - struct mgmt_mode *cp = power_off->param; - - /* The connection is still in hci_conn_hash so test for 1 - * instead of 0 to know if this is the last one. - */ - if (!cp->val && hci_conn_count(hdev) == 1) { - cancel_delayed_work(&hdev->power_off); - queue_work(hdev->req_workqueue, &hdev->power_off.work); - } + /* The connection is still in hci_conn_hash so test for 1 + * instead of 0 to know if this is the last one. + */ + if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) { + cancel_delayed_work(&hdev->power_off); + queue_work(hdev->req_workqueue, &hdev->power_off.work); } if (!mgmt_connected) @@ -6359,19 +6369,13 @@ void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status) { struct mgmt_ev_connect_failed ev; - struct pending_cmd *power_off; - - power_off = mgmt_pending_find(MGMT_OP_SET_POWERED, hdev); - if (power_off) { - struct mgmt_mode *cp = power_off->param; - /* The connection is still in hci_conn_hash so test for 1 - * instead of 0 to know if this is the last one. - */ - if (!cp->val && hci_conn_count(hdev) == 1) { - cancel_delayed_work(&hdev->power_off); - queue_work(hdev->req_workqueue, &hdev->power_off.work); - } + /* The connection is still in hci_conn_hash so test for 1 + * instead of 0 to know if this is the last one. + */ + if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) { + cancel_delayed_work(&hdev->power_off); + queue_work(hdev->req_workqueue, &hdev->power_off.work); } bacpy(&ev.addr.bdaddr, bdaddr); -- cgit v1.2.3 From 432df05eb1e57adfc46df08abbedca6c3b8862f7 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 1 Aug 2014 11:13:31 +0300 Subject: Bluetooth: Create unified helper function for updating page scan Similar to our hci_update_background_scan() function we can simplify a lot of code by creating a unified helper function for doing page scan updates. This patch adds such a function to hci_core.c and updates all the relevant places to use it. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 5 +++ net/bluetooth/hci_core.c | 31 +++++++++++++++++ net/bluetooth/mgmt.c | 72 +++++++--------------------------------- 3 files changed, 48 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 8394abc4fd87..cc2eb7730fbc 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -968,6 +968,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_host_le_capable(dev) (!!((dev)->features[1][0] & LMP_HOST_LE)) #define lmp_host_le_br_capable(dev) (!!((dev)->features[1][0] & LMP_HOST_LE_BREDR)) +#define hdev_is_powered(hdev) (test_bit(HCI_UP, &hdev->flags) && \ + !test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 @@ -1256,6 +1259,8 @@ bool hci_req_pending(struct hci_dev *hdev); void hci_req_add_le_scan_disable(struct hci_request *req); void hci_req_add_le_passive_scan(struct hci_request *req); +void hci_update_page_scan(struct hci_dev *hdev, struct hci_request *req); + struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout); struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index c32d361c0cf7..a031589598b2 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -5680,3 +5680,34 @@ void hci_update_background_scan(struct hci_dev *hdev) if (err) BT_ERR("Failed to run HCI request: err %d", err); } + +void hci_update_page_scan(struct hci_dev *hdev, struct hci_request *req) +{ + u8 scan; + + if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) + return; + + if (!hdev_is_powered(hdev)) + return; + + if (mgmt_powering_down(hdev)) + return; + + if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags) || + !list_empty(&hdev->whitelist)) + scan = SCAN_PAGE; + else + scan = SCAN_DISABLED; + + if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE)) + return; + + if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) + scan |= SCAN_INQUIRY; + + if (req) + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + else + hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); +} diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c9de0d9945f5..c2457435a670 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -129,9 +129,6 @@ static const u16 mgmt_events[] = { #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) -#define hdev_is_powered(hdev) (test_bit(HCI_UP, &hdev->flags) && \ - !test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) - struct pending_cmd { struct list_head list; u16 opcode; @@ -1536,9 +1533,11 @@ static void set_discoverable_complete(struct hci_dev *hdev, u8 status) /* When the discoverable mode gets changed, make sure * that class of device has the limited discoverable - * bit correctly set. + * bit correctly set. Also update page scan based on whitelist + * entries. */ hci_req_init(&req, hdev); + hci_update_page_scan(hdev, &req); update_class(&req); hci_req_run(&req, NULL); @@ -1785,6 +1784,7 @@ static void set_connectable_complete(struct hci_dev *hdev, u8 status) if (conn_changed || discov_changed) { new_settings(hdev, cmd->sk); + hci_update_page_scan(hdev, NULL); if (discov_changed) mgmt_update_adv_data(hdev); hci_update_background_scan(hdev); @@ -1818,6 +1818,7 @@ static int set_connectable_update_settings(struct hci_dev *hdev, return err; if (changed) { + hci_update_page_scan(hdev, NULL); hci_update_background_scan(hdev); return new_settings(hdev, sk); } @@ -4381,27 +4382,6 @@ unlock: return err; } -static void set_bredr_scan(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - u8 scan = 0; - - /* Ensure that fast connectable is disabled. This function will - * not do anything if the page scan parameters are already what - * they should be. - */ - write_fast_connectable(req, false); - - if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags) || - !list_empty(&hdev->whitelist)) - scan |= SCAN_PAGE; - if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) - scan |= SCAN_INQUIRY; - - if (scan) - hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); -} - static void set_bredr_complete(struct hci_dev *hdev, u8 status) { struct pending_cmd *cmd; @@ -4507,9 +4487,8 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) hci_req_init(&req, hdev); - if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags) || - !list_empty(&hdev->whitelist)) - set_bredr_scan(&req); + write_fast_connectable(&req, false); + hci_update_page_scan(hdev, &req); /* Since only the advertising data flags will change, there * is no need to update the scan response data. @@ -5235,27 +5214,6 @@ unlock: return err; } -/* Helper for Add/Remove Device commands */ -static void update_page_scan(struct hci_dev *hdev, u8 scan) -{ - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) - return; - - if (!hdev_is_powered(hdev)) - return; - - /* If HCI_CONNECTABLE is set then Add/Remove Device should not - * make any changes to page scanning. - */ - if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags)) - return; - - if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) - scan |= SCAN_INQUIRY; - - hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); -} - static void device_added(struct sock *sk, struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type, u8 action) { @@ -5291,8 +5249,6 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); if (cp->addr.type == BDADDR_BREDR) { - bool update_scan; - /* Only incoming connections action is supported for now */ if (cp->action != 0x01) { err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, @@ -5301,15 +5257,12 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - update_scan = list_empty(&hdev->whitelist); - err = hci_bdaddr_list_add(&hdev->whitelist, &cp->addr.bdaddr, cp->addr.type); if (err) goto unlock; - if (update_scan) - update_page_scan(hdev, SCAN_PAGE); + hci_update_page_scan(hdev, NULL); goto added; } @@ -5392,8 +5345,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (list_empty(&hdev->whitelist)) - update_page_scan(hdev, SCAN_DISABLED); + hci_update_page_scan(hdev, NULL); device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type); @@ -5444,7 +5396,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, kfree(b); } - update_page_scan(hdev, SCAN_DISABLED); + hci_update_page_scan(hdev, NULL); list_for_each_entry_safe(p, tmp, &hdev->le_conn_params, list) { if (p->auto_connect == HCI_AUTO_CONN_DISABLED) @@ -5969,8 +5921,8 @@ static int powered_update_hci(struct hci_dev *hdev) sizeof(link_sec), &link_sec); if (lmp_bredr_capable(hdev)) { - if (test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) - set_bredr_scan(&req); + write_fast_connectable(&req, false); + hci_update_page_scan(hdev, &req); update_class(&req); update_name(&req); update_eir(&req); -- cgit v1.2.3 From 22f433dcf7c71cf075e4c42b5f36ea4352978a6d Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 1 Aug 2014 11:13:32 +0300 Subject: Bluetooth: Disable page scan if all whitelisted devices are connected When we're not connectable and all whitelisted (BR/EDR) devices are connected it doesn't make sense to keep page scan enabled. This patch adds code to check for any disconnected whitelist devices and if there are none take the appropriate action in the hci_update_page_scan() function to disable page scan. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 20 +++++++++++++++++++- net/bluetooth/hci_event.c | 11 ++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index a031589598b2..217ef83fb541 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -5681,6 +5681,24 @@ void hci_update_background_scan(struct hci_dev *hdev) BT_ERR("Failed to run HCI request: err %d", err); } +static bool disconnected_whitelist_entries(struct hci_dev *hdev) +{ + struct bdaddr_list *b; + + list_for_each_entry(b, &hdev->whitelist, list) { + struct hci_conn *conn; + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr); + if (!conn) + return true; + + if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) + return true; + } + + return false; +} + void hci_update_page_scan(struct hci_dev *hdev, struct hci_request *req) { u8 scan; @@ -5695,7 +5713,7 @@ void hci_update_page_scan(struct hci_dev *hdev, struct hci_request *req) return; if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags) || - !list_empty(&hdev->whitelist)) + disconnected_whitelist_entries(hdev)) scan = SCAN_PAGE; else scan = SCAN_DISABLED; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index be35598984d9..da7ab6b9bb69 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2071,6 +2071,8 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) cp.handle = ev->handle; hci_send_cmd(hdev, HCI_OP_READ_REMOTE_FEATURES, sizeof(cp), &cp); + + hci_update_page_scan(hdev, NULL); } /* Set packet type for incoming connection */ @@ -2247,9 +2249,12 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type, reason, mgmt_connected); - if (conn->type == ACL_LINK && - test_bit(HCI_CONN_FLUSH_KEY, &conn->flags)) - hci_remove_link_key(hdev, &conn->dst); + if (conn->type == ACL_LINK) { + if (test_bit(HCI_CONN_FLUSH_KEY, &conn->flags)) + hci_remove_link_key(hdev, &conn->dst); + + hci_update_page_scan(hdev, NULL); + } params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); if (params) { -- cgit v1.2.3 From 5fcb93475697911eb239f68241903eb5540803ac Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 10:03:31 +0300 Subject: Bluetooth: Remove redundant check for remote_key_dist In the smp_cmd_sign_info() function the SMP_DIST_SIGN bit is explicitly cleared early on in the function. This means that there's no need to check for it again before calling smp_distribute_keys(). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index fd3294300803..40db728f044b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1168,8 +1168,7 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(csrk->val, rp->csrk, sizeof(csrk->val)); } smp->csrk = csrk; - if (!(smp->remote_key_dist & SMP_DIST_SIGN)) - smp_distribute_keys(conn); + smp_distribute_keys(conn); hci_dev_unlock(hdev); return 0; -- cgit v1.2.3 From 2b29349044cc2cf74d4c6e23e26cd27977d91353 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 10:03:32 +0300 Subject: Bluetooth: Fix confusion between parent and child channel for 6lowpan The new_connection L2CAP channel callback creates a new channel based on the provided parent channel. The 6lowpan code was confusingly naming the child channel "pchan" and the parent channel "chan". This patch swaps the names. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 206b65ccd5b8..35ebe79c87b0 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -772,16 +772,16 @@ static inline void chan_ready_cb(struct l2cap_chan *chan) ifup(dev->netdev); } -static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *chan) +static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan) { - struct l2cap_chan *pchan; + struct l2cap_chan *chan; - pchan = chan_open(chan); - pchan->ops = chan->ops; + chan = chan_open(pchan); + chan->ops = pchan->ops; BT_DBG("chan %p pchan %p", chan, pchan); - return pchan; + return chan; } static void delete_netdev(struct work_struct *work) -- cgit v1.2.3 From a24cce144b9814a17f46006dbad6056f1f5f481e Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 22:56:42 +0300 Subject: Bluetooth: Fix reference counting of global L2CAP channels When looking up entries from the global L2CAP channel list there needs to be a guarantee that other code doesn't go and remove the entry after a channel has been returned by the lookup function. This patch makes sure that the channel reference is incremented before the read lock is released in the global channel lookup functions. The patch also adds the corresponding l2cap_chan_put() calls once the channels pointers are no-longer needed. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 46547b920f88..a47a14efd5aa 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1440,6 +1440,7 @@ static struct l2cap_chan *l2cap_global_chan_by_scid(int state, u16 cid, src_match = !bacmp(&c->src, src); dst_match = !bacmp(&c->dst, dst); if (src_match && dst_match) { + l2cap_chan_hold(c); read_unlock(&chan_list_lock); return c; } @@ -1453,6 +1454,9 @@ static struct l2cap_chan *l2cap_global_chan_by_scid(int state, u16 cid, } } + if (c1) + l2cap_chan_hold(c1); + read_unlock(&chan_list_lock); return c1; @@ -1475,13 +1479,13 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) /* Client ATT sockets should override the server one */ if (__l2cap_get_chan_by_dcid(conn, L2CAP_CID_ATT)) - return; + goto put; dst_type = bdaddr_type(hcon, hcon->dst_type); /* If device is blocked, do not create a channel for it */ if (hci_bdaddr_list_lookup(&hdev->blacklist, &hcon->dst, dst_type)) - return; + goto put; /* For LE slave connections, make sure the connection interval * is in the range of the minium and maximum interval that has @@ -1517,6 +1521,8 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) clean: l2cap_chan_unlock(pchan); +put: + l2cap_chan_put(pchan); } static void l2cap_conn_ready(struct l2cap_conn *conn) @@ -1794,6 +1800,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, src_match = !bacmp(&c->src, src); dst_match = !bacmp(&c->dst, dst); if (src_match && dst_match) { + l2cap_chan_hold(c); read_unlock(&chan_list_lock); return c; } @@ -1807,6 +1814,9 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, } } + if (c1) + l2cap_chan_hold(c1); + read_unlock(&chan_list_lock); return c1; @@ -3884,6 +3894,7 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, response: l2cap_chan_unlock(pchan); mutex_unlock(&conn->chan_lock); + l2cap_chan_put(pchan); sendresp: rsp.scid = cpu_to_le16(scid); @@ -5497,6 +5508,7 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, response_unlock: l2cap_chan_unlock(pchan); mutex_unlock(&conn->chan_lock); + l2cap_chan_put(pchan); if (result == L2CAP_CR_PEND) return 0; @@ -6845,12 +6857,12 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, struct l2cap_chan *chan; if (hcon->type != ACL_LINK) - goto drop; + goto free_skb; chan = l2cap_global_chan_by_psm(0, psm, &hcon->src, &hcon->dst, ACL_LINK); if (!chan) - goto drop; + goto free_skb; BT_DBG("chan %p, len %d", chan, skb->len); @@ -6864,10 +6876,14 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, bacpy(&bt_cb(skb)->bdaddr, &hcon->dst); bt_cb(skb)->psm = psm; - if (!chan->ops->recv(chan, skb)) + if (!chan->ops->recv(chan, skb)) { + l2cap_chan_put(chan); return; + } drop: + l2cap_chan_put(chan); +free_skb: kfree_skb(skb); } @@ -6878,22 +6894,26 @@ static void l2cap_att_channel(struct l2cap_conn *conn, struct l2cap_chan *chan; if (hcon->type != LE_LINK) - goto drop; + goto free_skb; chan = l2cap_global_chan_by_scid(BT_CONNECTED, L2CAP_CID_ATT, &hcon->src, &hcon->dst); if (!chan) - goto drop; + goto free_skb; BT_DBG("chan %p, len %d", chan, skb->len); if (chan->imtu < skb->len) goto drop; - if (!chan->ops->recv(chan, skb)) + if (!chan->ops->recv(chan, skb)) { + l2cap_chan_put(chan); return; + } drop: + l2cap_chan_put(chan); +free_skb: kfree_skb(skb); } -- cgit v1.2.3 From 5ff6f34d4260c542df3712e29ead87cf071ad472 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 22:56:43 +0300 Subject: Bluetooth: Fix __l2cap_no_conn_pending() usage with all channels The __l2cap_no_conn_pending() function would previously only return a meaningful value for connection oriented channels and was therefore not useful for anything else. As preparation of making the L2CAP code more generic allow the function to be called for other channel types as well by returning a meaningful value for them. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index a47a14efd5aa..a0a910eb897b 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1082,6 +1082,9 @@ static void l2cap_send_rr_or_rnr(struct l2cap_chan *chan, bool poll) static inline int __l2cap_no_conn_pending(struct l2cap_chan *chan) { + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) + return true; + return !test_bit(CONF_CONNECT_PEND, &chan->conf_state); } -- cgit v1.2.3 From d52deb17489b8155e031fb1a9f116c602d719e11 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 22:56:44 +0300 Subject: Bluetooth: Resume BT_CONNECTED state after LE security elevation The LE ATT socket uses a special trick where it temporarily sets BT_CONFIG state for the duration of a security level elevation. In order to not require special hacks for going back to BT_CONNECTED state in the l2cap_core.c code the most reasonable place to resume the state is the resume callback. This patch adds a new flag to track the pending security level change and ensures that the state is set back to BT_CONNECTED in the resume callback in case the flag is set. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 + net/bluetooth/l2cap_sock.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 8df15ad0d43f..4a51e7596608 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -708,6 +708,7 @@ enum { FLAG_EFS_ENABLE, FLAG_DEFER_SETUP, FLAG_LE_CONN_REQ_SENT, + FLAG_PENDING_SECURITY, }; enum { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 1884f72083c2..5a42f6a818c0 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -790,6 +790,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, if (chan->scid == L2CAP_CID_ATT) { if (smp_conn_security(conn->hcon, sec.level)) break; + set_bit(FLAG_PENDING_SECURITY, &chan->flags); sk->sk_state = BT_CONFIG; chan->state = BT_CONFIG; @@ -1359,6 +1360,11 @@ static void l2cap_sock_resume_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; + if (test_and_clear_bit(FLAG_PENDING_SECURITY, &chan->flags)) { + sk->sk_state = BT_CONNECTED; + chan->state = BT_CONNECTED; + } + clear_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags); sk->sk_state_change(sk); } -- cgit v1.2.3 From 191eb398c677444bc08cb4497467ca9e2b8696bc Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 22:56:45 +0300 Subject: Bluetooth: Remove special handling of ATT in l2cap_security_cfm() With the update to sk->resume() and __l2cap_no_conn_pending() we no-longer need to have special handling of ATT channels in the l2cap_security_cfm() function. The chan->sec_level update when encryption has been enabled is safe to do for any kind of channel, and the loop takes later care of calling chan->ready() or chan->resume() if necessary. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index a0a910eb897b..54cbfcd4da44 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7341,15 +7341,8 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) continue; } - if (chan->scid == L2CAP_CID_ATT) { - if (!status && encrypt) { - chan->sec_level = hcon->sec_level; - l2cap_chan_ready(chan); - } - - l2cap_chan_unlock(chan); - continue; - } + if (!status && encrypt) + chan->sec_level = hcon->sec_level; if (!__l2cap_no_conn_pending(chan)) { l2cap_chan_unlock(chan); -- cgit v1.2.3 From dc0f5088182b2e48ae47629a55bdf35ad5d6ed44 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 22:56:46 +0300 Subject: Bluetooth: Refactor l2cap_connect_cfm This patch is a simple refactoring of l2cap_connect_cfm to allow easier extension of the function. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 54cbfcd4da44..7a5cff89b792 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7268,13 +7268,16 @@ void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); - if (!status) { - conn = l2cap_conn_add(hcon); - if (conn) - l2cap_conn_ready(conn); - } else { + if (status) { l2cap_conn_del(hcon, bt_to_errno(status)); + return; } + + conn = l2cap_conn_add(hcon); + if (!conn) + return; + + l2cap_conn_ready(conn); } int l2cap_disconn_ind(struct hci_conn *hcon) -- cgit v1.2.3 From e760ec12134d5736065c4e88b0e783cc1fe0d20a Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 22:56:47 +0300 Subject: Bluetooth: Move L2CAP fixed channel creation into l2cap_conn_cfm In order to remove special handling of fixed L2CAP channels we need to start creating them in a single place instead of having per-channel exceptions. The most natural place is the l2cap_conn_cfm() function which is called whenever there is a new baseband link. The only really special case so far has been the ATT socket, so in order not to break the code in between this patch removes the ATT special handling at the same time as it adds the generic fixed channel handling from l2cap_le_conn_ready() into the hci_conn_cfm() function. As a related change the channel locking in l2cap_conn_ready() becomes simpler and we can thereby move the smp_conn_security() call into the l2cap_le_conn_ready() function. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 124 +++++++++++++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 7a5cff89b792..8c4767f91a23 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1469,26 +1469,14 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) { struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; - struct l2cap_chan *chan, *pchan; - u8 dst_type; - BT_DBG(""); - - /* Check if we have socket listening on cid */ - pchan = l2cap_global_chan_by_scid(BT_LISTEN, L2CAP_CID_ATT, - &hcon->src, &hcon->dst); - if (!pchan) - return; + BT_DBG("%s conn %p", hdev->name, conn); - /* Client ATT sockets should override the server one */ - if (__l2cap_get_chan_by_dcid(conn, L2CAP_CID_ATT)) - goto put; - - dst_type = bdaddr_type(hcon, hcon->dst_type); - - /* If device is blocked, do not create a channel for it */ - if (hci_bdaddr_list_lookup(&hdev->blacklist, &hcon->dst, dst_type)) - goto put; + /* For outgoing pairing which doesn't necessarily have an + * associated socket (e.g. mgmt_pair_device). + */ + if (hcon->out) + smp_conn_security(hcon, hcon->pending_sec_level); /* For LE slave connections, make sure the connection interval * is in the range of the minium and maximum interval that has @@ -1508,24 +1496,6 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONN_PARAM_UPDATE_REQ, sizeof(req), &req); } - - l2cap_chan_lock(pchan); - - chan = pchan->ops->new_connection(pchan); - if (!chan) - goto clean; - - bacpy(&chan->src, &hcon->src); - bacpy(&chan->dst, &hcon->dst); - chan->src_type = bdaddr_type(hcon, hcon->src_type); - chan->dst_type = dst_type; - - __l2cap_chan_add(conn, chan); - -clean: - l2cap_chan_unlock(pchan); -put: - l2cap_chan_put(pchan); } static void l2cap_conn_ready(struct l2cap_conn *conn) @@ -1535,17 +1505,11 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) BT_DBG("conn %p", conn); - /* For outgoing pairing which doesn't necessarily have an - * associated socket (e.g. mgmt_pair_device). - */ - if (hcon->out && hcon->type == LE_LINK) - smp_conn_security(hcon, hcon->pending_sec_level); - - mutex_lock(&conn->chan_lock); - if (hcon->type == LE_LINK) l2cap_le_conn_ready(conn); + mutex_lock(&conn->chan_lock); + list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -7262,9 +7226,44 @@ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr) return exact ? lm1 : lm2; } +/* Find the next fixed channel in BT_LISTEN state, continue iteration + * from an existing channel in the list or from the beginning of the + * global list (by passing NULL as first parameter). + */ +static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c, + bdaddr_t *src) +{ + read_lock(&chan_list_lock); + + if (c) + c = list_next_entry(c, global_l); + else + c = list_entry(chan_list.next, typeof(*c), global_l); + + list_for_each_entry_from(c, &chan_list, global_l) { + if (c->chan_type != L2CAP_CHAN_FIXED) + continue; + if (c->state != BT_LISTEN) + continue; + if (bacmp(&c->src, src) && bacmp(&c->src, BDADDR_ANY)) + continue; + + l2cap_chan_hold(c); + read_unlock(&chan_list_lock); + return c; + } + + read_unlock(&chan_list_lock); + + return NULL; +} + void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) { + struct hci_dev *hdev = hcon->hdev; struct l2cap_conn *conn; + struct l2cap_chan *pchan; + u8 dst_type; BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); @@ -7277,6 +7276,43 @@ void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) if (!conn) return; + dst_type = bdaddr_type(hcon, hcon->dst_type); + + /* If device is blocked, do not create channels for it */ + if (hci_bdaddr_list_lookup(&hdev->blacklist, &hcon->dst, dst_type)) + return; + + /* Find fixed channels and notify them of the new connection. We + * use multiple individual lookups, continuing each time where + * we left off, because the list lock would prevent calling the + * potentially sleeping l2cap_chan_lock() function. + */ + pchan = l2cap_global_fixed_chan(NULL, &hdev->bdaddr); + while (pchan) { + struct l2cap_chan *chan, *next; + + /* Client fixed channels should override server ones */ + if (__l2cap_get_chan_by_dcid(conn, pchan->scid)) + goto next; + + l2cap_chan_lock(pchan); + chan = pchan->ops->new_connection(pchan); + if (chan) { + bacpy(&chan->src, &hcon->src); + bacpy(&chan->dst, &hcon->dst); + chan->src_type = bdaddr_type(hcon, hcon->src_type); + chan->dst_type = dst_type; + + __l2cap_chan_add(conn, chan); + } + + l2cap_chan_unlock(pchan); +next: + next = l2cap_global_fixed_chan(pchan, &hdev->bdaddr); + l2cap_chan_put(pchan); + pchan = next; + } + l2cap_conn_ready(conn); } -- cgit v1.2.3 From 54a1b626c96039f172dd2ea15b2671053b3c5a68 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 22:56:48 +0300 Subject: Bluetooth: Improve fixed channel lookup based on link type When notifying global fixed channels of new connections it doesn't make sense to consider channels meant for a different link type than the one available. This patch adds an extra parameter to the l2cap_global_fixed_chan() lookup function and ensures that only channels matching the current hci_conn type are looked up. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 8c4767f91a23..191b58f48245 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7231,7 +7231,7 @@ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr) * global list (by passing NULL as first parameter). */ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c, - bdaddr_t *src) + bdaddr_t *src, u8 link_type) { read_lock(&chan_list_lock); @@ -7247,6 +7247,10 @@ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c, continue; if (bacmp(&c->src, src) && bacmp(&c->src, BDADDR_ANY)) continue; + if (link_type == ACL_LINK && c->src_type != BDADDR_BREDR) + continue; + if (link_type == LE_LINK && c->src_type == BDADDR_BREDR) + continue; l2cap_chan_hold(c); read_unlock(&chan_list_lock); @@ -7287,7 +7291,7 @@ void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) * we left off, because the list lock would prevent calling the * potentially sleeping l2cap_chan_lock() function. */ - pchan = l2cap_global_fixed_chan(NULL, &hdev->bdaddr); + pchan = l2cap_global_fixed_chan(NULL, &hdev->bdaddr, hcon->type); while (pchan) { struct l2cap_chan *chan, *next; @@ -7308,7 +7312,8 @@ void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) l2cap_chan_unlock(pchan); next: - next = l2cap_global_fixed_chan(pchan, &hdev->bdaddr); + next = l2cap_global_fixed_chan(pchan, &hdev->bdaddr, + hcon->type); l2cap_chan_put(pchan); pchan = next; } -- cgit v1.2.3 From 06171e0546434b006a3857ea745b4258ad5d677c Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 22:56:49 +0300 Subject: Bluetooth: Remove special ATT data channel handling Now that we've got the fixed channel infrastructure cleaned up in a generic way there's no longer a need to have a dedicated function for handling data on the ATT channel. Instead the generic l2cap_data_channel() handler will be able to do the exact same thing. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 79 ---------------------------------------------- 1 file changed, 79 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 191b58f48245..09eb4e3e8ab2 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1420,51 +1420,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn) mutex_unlock(&conn->chan_lock); } -/* Find socket with cid and source/destination bdaddr. - * Returns closest match, locked. - */ -static struct l2cap_chan *l2cap_global_chan_by_scid(int state, u16 cid, - bdaddr_t *src, - bdaddr_t *dst) -{ - struct l2cap_chan *c, *c1 = NULL; - - read_lock(&chan_list_lock); - - list_for_each_entry(c, &chan_list, global_l) { - if (state && c->state != state) - continue; - - if (c->scid == cid) { - int src_match, dst_match; - int src_any, dst_any; - - /* Exact match. */ - src_match = !bacmp(&c->src, src); - dst_match = !bacmp(&c->dst, dst); - if (src_match && dst_match) { - l2cap_chan_hold(c); - read_unlock(&chan_list_lock); - return c; - } - - /* Closest match */ - src_any = !bacmp(&c->src, BDADDR_ANY); - dst_any = !bacmp(&c->dst, BDADDR_ANY); - if ((src_match && dst_any) || (src_any && dst_match) || - (src_any && dst_any)) - c1 = c; - } - } - - if (c1) - l2cap_chan_hold(c1); - - read_unlock(&chan_list_lock); - - return c1; -} - static void l2cap_le_conn_ready(struct l2cap_conn *conn) { struct hci_conn *hcon = conn->hcon; @@ -6854,36 +6809,6 @@ free_skb: kfree_skb(skb); } -static void l2cap_att_channel(struct l2cap_conn *conn, - struct sk_buff *skb) -{ - struct hci_conn *hcon = conn->hcon; - struct l2cap_chan *chan; - - if (hcon->type != LE_LINK) - goto free_skb; - - chan = l2cap_global_chan_by_scid(BT_CONNECTED, L2CAP_CID_ATT, - &hcon->src, &hcon->dst); - if (!chan) - goto free_skb; - - BT_DBG("chan %p, len %d", chan, skb->len); - - if (chan->imtu < skb->len) - goto drop; - - if (!chan->ops->recv(chan, skb)) { - l2cap_chan_put(chan); - return; - } - -drop: - l2cap_chan_put(chan); -free_skb: - kfree_skb(skb); -} - static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) { struct l2cap_hdr *lh = (void *) skb->data; @@ -6929,10 +6854,6 @@ static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) l2cap_conless_channel(conn, psm, skb); break; - case L2CAP_CID_ATT: - l2cap_att_channel(conn, skb); - break; - case L2CAP_CID_LE_SIGNALING: l2cap_le_sig_channel(conn, skb); break; -- cgit v1.2.3 From 148243087b3a5d95a32825da26858dc9d893b141 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 7 Aug 2014 22:56:50 +0300 Subject: Bluetooth: Move parts of fixed channel initialization to l2cap_add_scid The l2cap_add_scid function is used for registering a fixed L2CAP channel. Instead of having separate initialization of the channel type and outgoing MTU in l2cap_sock.c it's more intuitive to do these things in the l2cap_add_scid function itself (and thereby make the functionality available to other users besides l2cap_sock.c). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 4 ++++ net/bluetooth/l2cap_sock.c | 9 --------- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 09eb4e3e8ab2..344a29c53227 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -210,6 +210,10 @@ int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid) { write_lock(&chan_list_lock); + /* Override the defaults (which are for conn-oriented) */ + chan->omtu = L2CAP_DEFAULT_MTU; + chan->chan_type = L2CAP_CHAN_FIXED; + chan->scid = scid; write_unlock(&chan_list_lock); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 5a42f6a818c0..ed06f88e6f10 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -99,15 +99,6 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) if (!bdaddr_type_is_valid(la.l2_bdaddr_type)) return -EINVAL; - if (la.l2_cid) { - /* When the socket gets created it defaults to - * CHAN_CONN_ORIENTED, so we need to overwrite the - * default here. - */ - chan->chan_type = L2CAP_CHAN_FIXED; - chan->omtu = L2CAP_DEFAULT_MTU; - } - if (bdaddr_type_is_le(la.l2_bdaddr_type)) { /* We only allow ATT user space socket */ if (la.l2_cid && -- cgit v1.2.3 From 72847ce02180e8a0be1b23ba53ffe437cdb25d6a Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:28:03 +0300 Subject: Bluetooth: Call L2CAP teardown callback before clearing chan->conn L2CAP channel implementations may want to still access the chan->conn pointer. This will particularly be the case for SMP that will want to clear a reference to the SMP channel in the l2cap_conn structure. The only user of the teardown callback so far is l2cap_sock.c and for the code there it makes no difference whether the callback is called before or after clearing the chan->conn pointer. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 344a29c53227..c6419f40cfba 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -566,6 +566,8 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) BT_DBG("chan %p, conn %p, err %d", chan, conn, err); + chan->ops->teardown(chan, err); + if (conn) { struct amp_mgr *mgr = conn->hcon->amp_mgr; /* Delete from channel list */ @@ -589,8 +591,6 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) amp_disconnect_logical_link(hs_hchan); } - chan->ops->teardown(chan, err); - if (test_bit(CONF_NOT_COMPLETE, &chan->conf_state)) return; -- cgit v1.2.3 From 79a0572736ad4b9cac7be72d4402f9c79db8ebaf Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:28:04 +0300 Subject: Bluetooth: Call l2cap_le_conn_ready after notifying channels For most cases it makes no difference whether l2cap_le_conn_ready() is called before or after calling the channel ready() callbacks, however for upcoming SMP code we need this as the ready() callback initializes certain structures that a call to smp_conn_security() from l2cap_le_conn_ready() depends on. Therefore, move the call to l2cap_le_conn_ready() after iterating through and notifying channels. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index c6419f40cfba..8acfe6f57b5e 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1464,9 +1464,6 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) BT_DBG("conn %p", conn); - if (hcon->type == LE_LINK) - l2cap_le_conn_ready(conn); - mutex_lock(&conn->chan_lock); list_for_each_entry(chan, &conn->chan_l, list) { @@ -1492,6 +1489,9 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) mutex_unlock(&conn->chan_lock); + if (hcon->type == LE_LINK) + l2cap_le_conn_ready(conn); + queue_work(hcon->hdev->workqueue, &conn->pending_rx_work); } -- cgit v1.2.3 From d3368605591b88cd8af522adadb1c460a8f8e7bb Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:28:05 +0300 Subject: Bluetooth: Fix using HCI_CONN_LE_SMP_PEND to check for SMP context The code is consistently using the HCI_CONN_LE_SMP_PEND flag check for the existence of the SMP context, with the exception of this one place in smp_sig_channel(). This patch converts the place to use the flag just like all other instances. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 40db728f044b..33016ec9b247 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1206,7 +1206,7 @@ int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) * returns an error). */ if (code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ && - !conn->smp_chan) { + !test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) { BT_ERR("Unexpected SMP command 0x%02x. Disconnecting.", code); kfree_skb(skb); return -EOPNOTSUPP; -- cgit v1.2.3 From fabed38fcf456cc5d3e6946fab78855aa65bd40b Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:32:48 +0300 Subject: Bluetooth: Fix hci_update_random_address() error return for no crypto If the AES crypto context is not available we cannot generate new RPAs. We should therefore cleanly return an error from the function responsible for updating the random address. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 217ef83fb541..860477090d78 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3882,6 +3882,12 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, !bacmp(&hdev->random_addr, &hdev->rpa)) return 0; + if (!hdev->tfm_aes) { + BT_ERR("%s crypto not available to generate RPA", + hdev->name); + return -EOPNOTSUPP; + } + err = smp_generate_rpa(hdev->tfm_aes, hdev->irk, &hdev->rpa); if (err < 0) { BT_ERR("%s failed to generate new RPA", hdev->name); -- cgit v1.2.3 From 893ededeb189aa48c308116a7acd793efae5c830 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:32:49 +0300 Subject: Bluetooth: Fix IRK lookup when tfm_aes is not available If the AES crypto has not been initialized properly we should cleanly return from the hci_find_irk_by_rpa() function. Right now this will not happen in practice, but once (in subsequent patches) SMP init is moved to after the HCI init procedure it is possible that the pointer is NULL. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 860477090d78..4a1ec259099e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3232,6 +3232,9 @@ struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) return irk; } + if (!hdev->tfm_aes) + return NULL; + list_for_each_entry(irk, &hdev->identity_resolving_keys, list) { if (smp_irk_matches(hdev->tfm_aes, irk->val, rpa)) { bacpy(&irk->rpa, rpa); -- cgit v1.2.3 From 222916e3e509f04678d0b6f13f7b17bbc8dd14b6 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:32:50 +0300 Subject: Bluetooth: Refactor SMP (de)initialization into separate functions As preparation for converting SMP to use the l2cap_chan infrastructure refactor the (de)initialization into separate functions. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 46 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 4a1ec259099e..1f691c50abbc 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1764,6 +1764,34 @@ static void hci_init4_req(struct hci_request *req, unsigned long opt) } } +static int hci_register_smp(struct hci_dev *hdev) +{ + int err; + + BT_DBG("%s", hdev->name); + + hdev->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(hdev->tfm_aes)) { + BT_ERR("Unable to create crypto context"); + err = PTR_ERR(hdev->tfm_aes); + hdev->tfm_aes = NULL; + return err; + } + + return 0; +} + +static void hci_unregister_smp(struct hci_dev *hdev) +{ + BT_DBG("%s", hdev->name); + + if (hdev->tfm_aes) { + crypto_free_blkcipher(hdev->tfm_aes); + hdev->tfm_aes = NULL; + } +} + static int __hci_init(struct hci_dev *hdev) { int err; @@ -4099,18 +4127,13 @@ int hci_register_dev(struct hci_dev *hdev) dev_set_name(&hdev->dev, "%s", hdev->name); - hdev->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(hdev->tfm_aes)) { - BT_ERR("Unable to create crypto context"); - error = PTR_ERR(hdev->tfm_aes); - hdev->tfm_aes = NULL; + error = hci_register_smp(hdev); + if (error) goto err_wqueue; - } error = device_add(&hdev->dev); if (error < 0) - goto err_tfm; + goto err_smp; hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, @@ -4152,8 +4175,8 @@ int hci_register_dev(struct hci_dev *hdev) return id; -err_tfm: - crypto_free_blkcipher(hdev->tfm_aes); +err_smp: + hci_unregister_smp(hdev); err_wqueue: destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); @@ -4205,8 +4228,7 @@ void hci_unregister_dev(struct hci_dev *hdev) rfkill_destroy(hdev->rfkill); } - if (hdev->tfm_aes) - crypto_free_blkcipher(hdev->tfm_aes); + hci_unregister_smp(hdev); device_del(&hdev->dev); -- cgit v1.2.3 From 54506918059a5bdbf396f34f2e0a2735803024db Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:32:51 +0300 Subject: Bluetooth: Move SMP initialization after HCI init First of all, it's wasteful to initialize SMP if it's never going to be used (e.g. on non-LE controllers). Second of all, when we move to use l2cap_chan we need to know the real local address, meaning we must have completed at least part of the HCI init. This patch moves the SMP initialization to after the HCI init procedure and makes it depend on whether the controller actually supports LE. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 1f691c50abbc..9eb2869b183b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1926,6 +1926,8 @@ static int __hci_init(struct hci_dev *hdev) debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs, &hdev->discov_interleaved_timeout); + + hci_register_smp(hdev); } return 0; @@ -4127,13 +4129,9 @@ int hci_register_dev(struct hci_dev *hdev) dev_set_name(&hdev->dev, "%s", hdev->name); - error = hci_register_smp(hdev); - if (error) - goto err_wqueue; - error = device_add(&hdev->dev); if (error < 0) - goto err_smp; + goto err_wqueue; hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, @@ -4175,8 +4173,6 @@ int hci_register_dev(struct hci_dev *hdev) return id; -err_smp: - hci_unregister_smp(hdev); err_wqueue: destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); -- cgit v1.2.3 From 711eafe345d993cf4831e890fa989d02c06cad62 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:32:52 +0300 Subject: Bluetooth: Move SMP (de)initialization to smp.c As preparation for moving SMP to use l2cap_chan infrastructure we need to move the (de)initialization functions to smp.c (where they'll eventually need access to the local L2CAP channel callbacks). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 32 ++------------------------------ net/bluetooth/smp.c | 26 ++++++++++++++++++++++++++ net/bluetooth/smp.h | 3 +++ 3 files changed, 31 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 9eb2869b183b..88575a633601 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1764,34 +1764,6 @@ static void hci_init4_req(struct hci_request *req, unsigned long opt) } } -static int hci_register_smp(struct hci_dev *hdev) -{ - int err; - - BT_DBG("%s", hdev->name); - - hdev->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(hdev->tfm_aes)) { - BT_ERR("Unable to create crypto context"); - err = PTR_ERR(hdev->tfm_aes); - hdev->tfm_aes = NULL; - return err; - } - - return 0; -} - -static void hci_unregister_smp(struct hci_dev *hdev) -{ - BT_DBG("%s", hdev->name); - - if (hdev->tfm_aes) { - crypto_free_blkcipher(hdev->tfm_aes); - hdev->tfm_aes = NULL; - } -} - static int __hci_init(struct hci_dev *hdev) { int err; @@ -1927,7 +1899,7 @@ static int __hci_init(struct hci_dev *hdev) hdev->debugfs, &hdev->discov_interleaved_timeout); - hci_register_smp(hdev); + smp_register(hdev); } return 0; @@ -4224,7 +4196,7 @@ void hci_unregister_dev(struct hci_dev *hdev) rfkill_destroy(hdev->rfkill); } - hci_unregister_smp(hdev); + smp_unregister(hdev); device_del(&hdev->dev); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 33016ec9b247..ab07649ecc77 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1455,3 +1455,29 @@ int smp_distribute_keys(struct l2cap_conn *conn) return 0; } + +int smp_register(struct hci_dev *hdev) +{ + BT_DBG("%s", hdev->name); + + hdev->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(hdev->tfm_aes)) { + int err = PTR_ERR(hdev->tfm_aes); + BT_ERR("Unable to create crypto context"); + hdev->tfm_aes = NULL; + return err; + } + + return 0; +} + +void smp_unregister(struct hci_dev *hdev) +{ + BT_DBG("%s", hdev->name); + + if (hdev->tfm_aes) { + crypto_free_blkcipher(hdev->tfm_aes); + hdev->tfm_aes = NULL; + } +} diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 796f4f45f92f..6e29359f60a3 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -136,4 +136,7 @@ bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], bdaddr_t *bdaddr); int smp_generate_rpa(struct crypto_blkcipher *tfm, u8 irk[16], bdaddr_t *rpa); +int smp_register(struct hci_dev *hdev); +void smp_unregister(struct hci_dev *hdev); + #endif /* __SMP_H */ -- cgit v1.2.3 From 70db83c4bcdc1447bbcb318389561c90d7056b18 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:37:16 +0300 Subject: Bluetooth: Add SMP L2CAP channel skeleton This patch creates the initial SMP L2CAP channels and a skeleton for their callbacks. There is one per-adapter channel created upon adapter registration, and then one channel per-connection created through the new_connection callback. The channels are registered with the reserved CID 0x1f for now in order to not conflict with existing SMP functionality. Once everything is in place the value can be changed to what it should be, i.e. L2CAP_CID_SMP. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/l2cap.h | 1 + net/bluetooth/smp.c | 131 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 132 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index cc2eb7730fbc..2571fc1cb1c5 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -303,6 +303,7 @@ struct hci_dev { __u32 req_result; struct crypto_blkcipher *tfm_aes; + void *smp_data; struct discovery_state discovery; struct hci_conn_hash conn_hash; diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index a72965f6bc2f..1a037ba4b6f4 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -637,6 +637,7 @@ struct l2cap_conn { struct delayed_work security_timer; struct smp_chan *smp_chan; + struct l2cap_chan *smp; struct list_head chan_l; struct mutex chan_lock; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ab07649ecc77..2362ae35a4d5 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1456,8 +1456,106 @@ int smp_distribute_keys(struct l2cap_conn *conn) return 0; } +static void smp_teardown_cb(struct l2cap_chan *chan, int err) +{ + struct l2cap_conn *conn = chan->conn; + + BT_DBG("chan %p", chan); + + conn->smp = NULL; + l2cap_chan_put(chan); +} + +static void smp_ready_cb(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + + BT_DBG("chan %p", chan); + + conn->smp = chan; + l2cap_chan_hold(chan); +} + +static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan, + unsigned long hdr_len, + unsigned long len, int nb) +{ + struct sk_buff *skb; + + skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); + + skb->priority = HCI_PRIO_MAX; + bt_cb(skb)->chan = chan; + + return skb; +} + +static const struct l2cap_ops smp_chan_ops = { + .name = "Security Manager", + .ready = smp_ready_cb, + .alloc_skb = smp_alloc_skb_cb, + .teardown = smp_teardown_cb, + + .new_connection = l2cap_chan_no_new_connection, + .recv = l2cap_chan_no_recv, + .state_change = l2cap_chan_no_state_change, + .close = l2cap_chan_no_close, + .defer = l2cap_chan_no_defer, + .suspend = l2cap_chan_no_suspend, + .resume = l2cap_chan_no_resume, + .set_shutdown = l2cap_chan_no_set_shutdown, + .get_sndtimeo = l2cap_chan_no_get_sndtimeo, + .memcpy_fromiovec = l2cap_chan_no_memcpy_fromiovec, +}; + +static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan) +{ + struct l2cap_chan *chan; + + BT_DBG("pchan %p", pchan); + + chan = l2cap_chan_create(); + if (!chan) + return NULL; + + chan->chan_type = pchan->chan_type; + chan->ops = &smp_chan_ops; + chan->scid = pchan->scid; + chan->dcid = chan->scid; + chan->imtu = pchan->imtu; + chan->omtu = pchan->omtu; + chan->mode = pchan->mode; + + BT_DBG("created chan %p", chan); + + return chan; +} + +static const struct l2cap_ops smp_root_chan_ops = { + .name = "Security Manager Root", + .new_connection = smp_new_conn_cb, + + /* None of these are implemented for the root channel */ + .close = l2cap_chan_no_close, + .alloc_skb = l2cap_chan_no_alloc_skb, + .recv = l2cap_chan_no_recv, + .state_change = l2cap_chan_no_state_change, + .teardown = l2cap_chan_no_teardown, + .ready = l2cap_chan_no_ready, + .defer = l2cap_chan_no_defer, + .suspend = l2cap_chan_no_suspend, + .resume = l2cap_chan_no_resume, + .set_shutdown = l2cap_chan_no_set_shutdown, + .get_sndtimeo = l2cap_chan_no_get_sndtimeo, + .memcpy_fromiovec = l2cap_chan_no_memcpy_fromiovec, +}; + int smp_register(struct hci_dev *hdev) { + struct l2cap_chan *chan; + BT_DBG("%s", hdev->name); hdev->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, @@ -1469,15 +1567,46 @@ int smp_register(struct hci_dev *hdev) return err; } + chan = l2cap_chan_create(); + if (!chan) { + crypto_free_blkcipher(hdev->tfm_aes); + hdev->tfm_aes = NULL; + return -ENOMEM; + } + + /* FIXME: Using reserved 0x1f value for now - to be changed to + * L2CAP_CID_SMP once all functionality is in place. + */ + l2cap_add_scid(chan, 0x1f); + + l2cap_chan_set_defaults(chan); + + bacpy(&chan->src, &hdev->bdaddr); + chan->src_type = BDADDR_LE_PUBLIC; + chan->state = BT_LISTEN; + chan->mode = L2CAP_MODE_BASIC; + chan->imtu = L2CAP_DEFAULT_MTU; + chan->ops = &smp_root_chan_ops; + + hdev->smp_data = chan; + return 0; } void smp_unregister(struct hci_dev *hdev) { - BT_DBG("%s", hdev->name); + struct l2cap_chan *chan = hdev->smp_data; + + if (!chan) + return; + + BT_DBG("%s chan %p", hdev->name, chan); if (hdev->tfm_aes) { crypto_free_blkcipher(hdev->tfm_aes); hdev->tfm_aes = NULL; } + + hdev->smp_data = NULL; + l2cap_chan_put(chan); } -- cgit v1.2.3 From defce9e83666658d4420d65e45ab1ad190992f72 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:37:17 +0300 Subject: Bluetooth: Make AES crypto context private to SMP Now that we have per-adapter SMP data thanks to the root SMP L2CAP channel we can take advantage of it and attach the AES crypto context (only used for SMP) to it. This means that the smp_irk_matches() and smp_generate_rpa() function can be converted to internally handle the AES context. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 - net/bluetooth/hci_core.c | 13 ++----------- net/bluetooth/smp.c | 41 +++++++++++++++++++++++++++------------- net/bluetooth/smp.h | 5 ++--- 4 files changed, 32 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2571fc1cb1c5..5f0b77b71b45 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -302,7 +302,6 @@ struct hci_dev { __u32 req_status; __u32 req_result; - struct crypto_blkcipher *tfm_aes; void *smp_data; struct discovery_state discovery; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 88575a633601..abeb5e47311e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3234,11 +3234,8 @@ struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) return irk; } - if (!hdev->tfm_aes) - return NULL; - list_for_each_entry(irk, &hdev->identity_resolving_keys, list) { - if (smp_irk_matches(hdev->tfm_aes, irk->val, rpa)) { + if (smp_irk_matches(hdev, irk->val, rpa)) { bacpy(&irk->rpa, rpa); return irk; } @@ -3887,13 +3884,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, !bacmp(&hdev->random_addr, &hdev->rpa)) return 0; - if (!hdev->tfm_aes) { - BT_ERR("%s crypto not available to generate RPA", - hdev->name); - return -EOPNOTSUPP; - } - - err = smp_generate_rpa(hdev->tfm_aes, hdev->irk, &hdev->rpa); + err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { BT_ERR("%s failed to generate new RPA", hdev->name); return err; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 2362ae35a4d5..6925fc4caaee 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -139,12 +139,18 @@ static int smp_ah(struct crypto_blkcipher *tfm, u8 irk[16], u8 r[3], u8 res[3]) return 0; } -bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], - bdaddr_t *bdaddr) +bool smp_irk_matches(struct hci_dev *hdev, u8 irk[16], bdaddr_t *bdaddr) { + struct l2cap_chan *chan = hdev->smp_data; + struct crypto_blkcipher *tfm; u8 hash[3]; int err; + if (!chan || !chan->data) + return false; + + tfm = chan->data; + BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(tfm, irk, &bdaddr->b[3], hash); @@ -154,10 +160,17 @@ bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], return !memcmp(bdaddr->b, hash, 3); } -int smp_generate_rpa(struct crypto_blkcipher *tfm, u8 irk[16], bdaddr_t *rpa) +int smp_generate_rpa(struct hci_dev *hdev, u8 irk[16], bdaddr_t *rpa) { + struct l2cap_chan *chan = hdev->smp_data; + struct crypto_blkcipher *tfm; int err; + if (!chan || !chan->data) + return -EOPNOTSUPP; + + tfm = chan->data; + get_random_bytes(&rpa->b[3], 3); rpa->b[5] &= 0x3f; /* Clear two most significant bits */ @@ -1555,25 +1568,25 @@ static const struct l2cap_ops smp_root_chan_ops = { int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; + struct crypto_blkcipher *tfm_aes; BT_DBG("%s", hdev->name); - hdev->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(hdev->tfm_aes)) { - int err = PTR_ERR(hdev->tfm_aes); + tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm_aes)) { + int err = PTR_ERR(tfm_aes); BT_ERR("Unable to create crypto context"); - hdev->tfm_aes = NULL; return err; } chan = l2cap_chan_create(); if (!chan) { - crypto_free_blkcipher(hdev->tfm_aes); - hdev->tfm_aes = NULL; + crypto_free_blkcipher(tfm_aes); return -ENOMEM; } + chan->data = tfm_aes; + /* FIXME: Using reserved 0x1f value for now - to be changed to * L2CAP_CID_SMP once all functionality is in place. */ @@ -1596,15 +1609,17 @@ int smp_register(struct hci_dev *hdev) void smp_unregister(struct hci_dev *hdev) { struct l2cap_chan *chan = hdev->smp_data; + struct crypto_blkcipher *tfm_aes; if (!chan) return; BT_DBG("%s chan %p", hdev->name, chan); - if (hdev->tfm_aes) { - crypto_free_blkcipher(hdev->tfm_aes); - hdev->tfm_aes = NULL; + tfm_aes = chan->data; + if (tfm_aes) { + chan->data = NULL; + crypto_free_blkcipher(tfm_aes); } hdev->smp_data = NULL; diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 6e29359f60a3..d3f6bd617940 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -132,9 +132,8 @@ int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey); void smp_chan_destroy(struct l2cap_conn *conn); -bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], - bdaddr_t *bdaddr); -int smp_generate_rpa(struct crypto_blkcipher *tfm, u8 irk[16], bdaddr_t *rpa); +bool smp_irk_matches(struct hci_dev *hdev, u8 irk[16], bdaddr_t *bdaddr); +int smp_generate_rpa(struct hci_dev *hdev, u8 irk[16], bdaddr_t *rpa); int smp_register(struct hci_dev *hdev); void smp_unregister(struct hci_dev *hdev); -- cgit v1.2.3 From 5d88cc73dded31a93fcc4821f33a8c3d755bf454 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 8 Aug 2014 09:37:18 +0300 Subject: Bluetooth: Convert SMP to use l2cap_chan infrastructure Now that we have all the necessary pieces in place we can fully convert SMP to use the L2CAP channel infrastructure. This patch adds the necessary callbacks and removes the now unneeded conn->smp_chan pointer. One notable behavioral change in this patch comes from the following code snippet: - case L2CAP_CID_SMP: - if (smp_sig_channel(conn, skb)) - l2cap_conn_del(conn->hcon, EACCES); This piece of code was essentially forcing a disconnection if garbage SMP data was received. The l2cap_conn_del() function is private to l2cap_conn.c so we don't have access to it anymore when using the L2CAP channel callbacks. Therefore, the behavior of the new code is simply to return errors in the recv() callback (which is simply the old smp_sig_channel()), but no disconnection will occur. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 - net/bluetooth/l2cap_core.c | 10 ---- net/bluetooth/smp.c | 122 +++++++++++++++++++++++------------------- net/bluetooth/smp.h | 1 - 4 files changed, 66 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 1a037ba4b6f4..bda6252e3722 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -636,7 +636,6 @@ struct l2cap_conn { __u8 disc_reason; struct delayed_work security_timer; - struct smp_chan *smp_chan; struct l2cap_chan *smp; struct list_head chan_l; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 8acfe6f57b5e..40b6c06ab2c0 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1651,11 +1651,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) cancel_delayed_work_sync(&conn->info_timer); - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) { - cancel_delayed_work_sync(&conn->security_timer); - smp_chan_destroy(conn); - } - hcon->l2cap_data = NULL; conn->hchan = NULL; l2cap_conn_put(conn); @@ -6862,11 +6857,6 @@ static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) l2cap_le_sig_channel(conn, skb); break; - case L2CAP_CID_SMP: - if (smp_sig_channel(conn, skb)) - l2cap_conn_del(conn->hcon, EACCES); - break; - default: l2cap_data_channel(conn, cid, skb); break; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 6925fc4caaee..744f678ac3e8 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -248,44 +248,29 @@ static int smp_s1(struct smp_chan *smp, u8 k[16], u8 r1[16], u8 r2[16], return err; } -static struct sk_buff *smp_build_cmd(struct l2cap_conn *conn, u8 code, - u16 dlen, void *data) +static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) { - struct sk_buff *skb; - struct l2cap_hdr *lh; - int len; - - len = L2CAP_HDR_SIZE + sizeof(code) + dlen; - - if (len > conn->mtu) - return NULL; + struct l2cap_chan *chan = conn->smp; + struct kvec iv[2]; + struct msghdr msg; - skb = bt_skb_alloc(len, GFP_ATOMIC); - if (!skb) - return NULL; + if (!chan) + return; - lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); - lh->len = cpu_to_le16(sizeof(code) + dlen); - lh->cid = cpu_to_le16(L2CAP_CID_SMP); + BT_DBG("code 0x%2.2x", code); - memcpy(skb_put(skb, sizeof(code)), &code, sizeof(code)); + iv[0].iov_base = &code; + iv[0].iov_len = 1; - memcpy(skb_put(skb, dlen), data, dlen); + iv[1].iov_base = data; + iv[1].iov_len = len; - return skb; -} + memset(&msg, 0, sizeof(msg)); -static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) -{ - struct sk_buff *skb = smp_build_cmd(conn, code, len, data); + msg.msg_iov = (struct iovec *) &iv; + msg.msg_iovlen = 2; - BT_DBG("code 0x%2.2x", code); - - if (!skb) - return; - - skb->priority = HCI_PRIO_MAX; - hci_send_acl(conn->hchan, skb, 0); + l2cap_chan_send(chan, &msg, 1 + len); cancel_delayed_work_sync(&conn->security_timer); schedule_delayed_work(&conn->security_timer, SMP_TIMEOUT); @@ -315,7 +300,8 @@ static void build_pairing_cmd(struct l2cap_conn *conn, struct smp_cmd_pairing *req, struct smp_cmd_pairing *rsp, __u8 authreq) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; u8 local_dist = 0, remote_dist = 0; @@ -358,7 +344,8 @@ static void build_pairing_cmd(struct l2cap_conn *conn, static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; if ((max_key_size > SMP_MAX_ENC_KEY_SIZE) || (max_key_size < SMP_MIN_ENC_KEY_SIZE)) @@ -418,7 +405,8 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, u8 local_io, u8 remote_io) { struct hci_conn *hcon = conn->hcon; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; u8 method; u32 passkey = 0; int ret = 0; @@ -589,6 +577,7 @@ static u8 smp_random(struct smp_chan *smp) static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) { + struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; smp = kzalloc(sizeof(*smp), GFP_ATOMIC); @@ -606,7 +595,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) } smp->conn = conn; - conn->smp_chan = smp; + chan->data = smp; hci_conn_hold(conn->hcon); @@ -615,7 +604,8 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) void smp_chan_destroy(struct l2cap_conn *conn) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; bool complete; BUG_ON(!smp); @@ -646,14 +636,15 @@ void smp_chan_destroy(struct l2cap_conn *conn) } } + chan->data = NULL; kfree(smp); - conn->smp_chan = NULL; hci_conn_drop(conn->hcon); } int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) { struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_chan *chan; struct smp_chan *smp; u32 value; @@ -662,7 +653,11 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) if (!conn || !test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) return -ENOTCONN; - smp = conn->smp_chan; + chan = conn->smp; + if (!chan) + return -ENOTCONN; + + smp = chan->data; switch (mgmt_op) { case MGMT_OP_USER_PASSKEY_REPLY: @@ -709,10 +704,12 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (conn->hcon->role != HCI_ROLE_SLAVE) return SMP_CMD_NOTSUPP; - if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) + if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { smp = smp_chan_create(conn); - else - smp = conn->smp_chan; + } else { + struct l2cap_chan *chan = conn->smp; + smp = chan->data; + } if (!smp) return SMP_UNSPECIFIED; @@ -766,7 +763,8 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing *req, *rsp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; u8 key_size, auth = SMP_AUTH_NONE; int ret; @@ -827,7 +825,8 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave"); @@ -850,7 +849,8 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG("conn %p", conn); @@ -1023,7 +1023,8 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_encrypt_info *rp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG("conn %p", conn); @@ -1044,7 +1045,8 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_master_ident *rp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; struct hci_conn *hcon = conn->hcon; struct smp_ltk *ltk; @@ -1080,7 +1082,8 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_info *info = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG(""); @@ -1102,7 +1105,8 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_addr_info *info = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; bdaddr_t rpa; @@ -1156,7 +1160,8 @@ distribute: static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_sign_info *rp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; struct smp_csrk *csrk; @@ -1187,8 +1192,9 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } -int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) +static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { + struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; __u8 code, reason; int err = 0; @@ -1290,7 +1296,8 @@ done: static void smp_notify_keys(struct l2cap_conn *conn) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_cmd_pairing *req = (void *) &smp->preq[1]; @@ -1357,7 +1364,8 @@ static void smp_notify_keys(struct l2cap_conn *conn) int smp_distribute_keys(struct l2cap_conn *conn) { struct smp_cmd_pairing *req, *rsp; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; __u8 *keydist; @@ -1475,6 +1483,11 @@ static void smp_teardown_cb(struct l2cap_chan *chan, int err) BT_DBG("chan %p", chan); + if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { + cancel_delayed_work_sync(&conn->security_timer); + smp_chan_destroy(conn); + } + conn->smp = NULL; l2cap_chan_put(chan); } @@ -1508,11 +1521,11 @@ static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan, static const struct l2cap_ops smp_chan_ops = { .name = "Security Manager", .ready = smp_ready_cb, + .recv = smp_recv_cb, .alloc_skb = smp_alloc_skb_cb, .teardown = smp_teardown_cb, .new_connection = l2cap_chan_no_new_connection, - .recv = l2cap_chan_no_recv, .state_change = l2cap_chan_no_state_change, .close = l2cap_chan_no_close, .defer = l2cap_chan_no_defer, @@ -1587,10 +1600,7 @@ int smp_register(struct hci_dev *hdev) chan->data = tfm_aes; - /* FIXME: Using reserved 0x1f value for now - to be changed to - * L2CAP_CID_SMP once all functionality is in place. - */ - l2cap_add_scid(chan, 0x1f); + l2cap_add_scid(chan, L2CAP_CID_SMP); l2cap_chan_set_defaults(chan); diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index d3f6bd617940..161ace3c3234 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -126,7 +126,6 @@ enum { /* SMP Commands */ bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); -int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb); int smp_distribute_keys(struct l2cap_conn *conn); int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey); -- cgit v1.2.3 From 44f1a7ab51ebe1ca189445837e0599a5edc6efb1 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 11 Aug 2014 22:06:36 +0300 Subject: Bluetooth: Use L2CAP resume callback to call smp_distribute_keys There's no need to export the smp_distribute_keys() function since the resume callback is called in the same scenario. This patch makes the smp_notify_keys function private (at the same time moving it higher up in smp.c to avoid forward declarations) and adds a resume callback for SMP to call it from there instead. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 6 - net/bluetooth/smp.c | 380 +++++++++++++++++++++++---------------------- net/bluetooth/smp.h | 1 - 3 files changed, 196 insertions(+), 191 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 40b6c06ab2c0..4de1e1827055 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7281,12 +7281,6 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt); - if (hcon->type == LE_LINK) { - if (!status && encrypt) - smp_distribute_keys(conn); - cancel_delayed_work(&conn->security_timer); - } - mutex_lock(&conn->chan_lock); list_for_each_entry(chan, &conn->chan_l, list) { diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 744f678ac3e8..28014ad3d2d3 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -575,6 +575,189 @@ static u8 smp_random(struct smp_chan *smp) return 0; } +static void smp_notify_keys(struct l2cap_conn *conn) +{ + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; + struct hci_conn *hcon = conn->hcon; + struct hci_dev *hdev = hcon->hdev; + struct smp_cmd_pairing *req = (void *) &smp->preq[1]; + struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1]; + bool persistent; + + if (smp->remote_irk) { + mgmt_new_irk(hdev, smp->remote_irk); + /* Now that user space can be considered to know the + * identity address track the connection based on it + * from now on. + */ + bacpy(&hcon->dst, &smp->remote_irk->bdaddr); + hcon->dst_type = smp->remote_irk->addr_type; + l2cap_conn_update_id_addr(hcon); + + /* When receiving an indentity resolving key for + * a remote device that does not use a resolvable + * private address, just remove the key so that + * it is possible to use the controller white + * list for scanning. + * + * Userspace will have been told to not store + * this key at this point. So it is safe to + * just remove it. + */ + if (!bacmp(&smp->remote_irk->rpa, BDADDR_ANY)) { + list_del(&smp->remote_irk->list); + kfree(smp->remote_irk); + smp->remote_irk = NULL; + } + } + + /* The LTKs and CSRKs should be persistent only if both sides + * had the bonding bit set in their authentication requests. + */ + persistent = !!((req->auth_req & rsp->auth_req) & SMP_AUTH_BONDING); + + if (smp->csrk) { + smp->csrk->bdaddr_type = hcon->dst_type; + bacpy(&smp->csrk->bdaddr, &hcon->dst); + mgmt_new_csrk(hdev, smp->csrk, persistent); + } + + if (smp->slave_csrk) { + smp->slave_csrk->bdaddr_type = hcon->dst_type; + bacpy(&smp->slave_csrk->bdaddr, &hcon->dst); + mgmt_new_csrk(hdev, smp->slave_csrk, persistent); + } + + if (smp->ltk) { + smp->ltk->bdaddr_type = hcon->dst_type; + bacpy(&smp->ltk->bdaddr, &hcon->dst); + mgmt_new_ltk(hdev, smp->ltk, persistent); + } + + if (smp->slave_ltk) { + smp->slave_ltk->bdaddr_type = hcon->dst_type; + bacpy(&smp->slave_ltk->bdaddr, &hcon->dst); + mgmt_new_ltk(hdev, smp->slave_ltk, persistent); + } +} + +static int smp_distribute_keys(struct l2cap_conn *conn) +{ + struct smp_cmd_pairing *req, *rsp; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; + struct hci_conn *hcon = conn->hcon; + struct hci_dev *hdev = hcon->hdev; + __u8 *keydist; + + BT_DBG("conn %p", conn); + + if (!test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) + return 0; + + rsp = (void *) &smp->prsp[1]; + + /* The responder sends its keys first */ + if (hcon->out && (smp->remote_key_dist & 0x07)) + return 0; + + req = (void *) &smp->preq[1]; + + if (hcon->out) { + keydist = &rsp->init_key_dist; + *keydist &= req->init_key_dist; + } else { + keydist = &rsp->resp_key_dist; + *keydist &= req->resp_key_dist; + } + + BT_DBG("keydist 0x%x", *keydist); + + if (*keydist & SMP_DIST_ENC_KEY) { + struct smp_cmd_encrypt_info enc; + struct smp_cmd_master_ident ident; + struct smp_ltk *ltk; + u8 authenticated; + __le16 ediv; + __le64 rand; + + get_random_bytes(enc.ltk, sizeof(enc.ltk)); + get_random_bytes(&ediv, sizeof(ediv)); + get_random_bytes(&rand, sizeof(rand)); + + smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc); + + authenticated = hcon->sec_level == BT_SECURITY_HIGH; + ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, + SMP_LTK_SLAVE, authenticated, enc.ltk, + smp->enc_key_size, ediv, rand); + smp->slave_ltk = ltk; + + ident.ediv = ediv; + ident.rand = rand; + + smp_send_cmd(conn, SMP_CMD_MASTER_IDENT, sizeof(ident), &ident); + + *keydist &= ~SMP_DIST_ENC_KEY; + } + + if (*keydist & SMP_DIST_ID_KEY) { + struct smp_cmd_ident_addr_info addrinfo; + struct smp_cmd_ident_info idinfo; + + memcpy(idinfo.irk, hdev->irk, sizeof(idinfo.irk)); + + smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo); + + /* The hci_conn contains the local identity address + * after the connection has been established. + * + * This is true even when the connection has been + * established using a resolvable random address. + */ + bacpy(&addrinfo.bdaddr, &hcon->src); + addrinfo.addr_type = hcon->src_type; + + smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo), + &addrinfo); + + *keydist &= ~SMP_DIST_ID_KEY; + } + + if (*keydist & SMP_DIST_SIGN) { + struct smp_cmd_sign_info sign; + struct smp_csrk *csrk; + + /* Generate a new random key */ + get_random_bytes(sign.csrk, sizeof(sign.csrk)); + + csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); + if (csrk) { + csrk->master = 0x00; + memcpy(csrk->val, sign.csrk, sizeof(csrk->val)); + } + smp->slave_csrk = csrk; + + smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign); + + *keydist &= ~SMP_DIST_SIGN; + } + + /* If there are still keys to be received wait for them */ + if ((smp->remote_key_dist & 0x07)) + return 0; + + clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags); + cancel_delayed_work_sync(&conn->security_timer); + set_bit(SMP_FLAG_COMPLETE, &smp->flags); + smp_notify_keys(conn); + + smp_chan_destroy(conn); + + return 0; +} + static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) { struct l2cap_chan *chan = conn->smp; @@ -1294,189 +1477,6 @@ done: return err; } -static void smp_notify_keys(struct l2cap_conn *conn) -{ - struct l2cap_chan *chan = conn->smp; - struct smp_chan *smp = chan->data; - struct hci_conn *hcon = conn->hcon; - struct hci_dev *hdev = hcon->hdev; - struct smp_cmd_pairing *req = (void *) &smp->preq[1]; - struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1]; - bool persistent; - - if (smp->remote_irk) { - mgmt_new_irk(hdev, smp->remote_irk); - /* Now that user space can be considered to know the - * identity address track the connection based on it - * from now on. - */ - bacpy(&hcon->dst, &smp->remote_irk->bdaddr); - hcon->dst_type = smp->remote_irk->addr_type; - l2cap_conn_update_id_addr(hcon); - - /* When receiving an indentity resolving key for - * a remote device that does not use a resolvable - * private address, just remove the key so that - * it is possible to use the controller white - * list for scanning. - * - * Userspace will have been told to not store - * this key at this point. So it is safe to - * just remove it. - */ - if (!bacmp(&smp->remote_irk->rpa, BDADDR_ANY)) { - list_del(&smp->remote_irk->list); - kfree(smp->remote_irk); - smp->remote_irk = NULL; - } - } - - /* The LTKs and CSRKs should be persistent only if both sides - * had the bonding bit set in their authentication requests. - */ - persistent = !!((req->auth_req & rsp->auth_req) & SMP_AUTH_BONDING); - - if (smp->csrk) { - smp->csrk->bdaddr_type = hcon->dst_type; - bacpy(&smp->csrk->bdaddr, &hcon->dst); - mgmt_new_csrk(hdev, smp->csrk, persistent); - } - - if (smp->slave_csrk) { - smp->slave_csrk->bdaddr_type = hcon->dst_type; - bacpy(&smp->slave_csrk->bdaddr, &hcon->dst); - mgmt_new_csrk(hdev, smp->slave_csrk, persistent); - } - - if (smp->ltk) { - smp->ltk->bdaddr_type = hcon->dst_type; - bacpy(&smp->ltk->bdaddr, &hcon->dst); - mgmt_new_ltk(hdev, smp->ltk, persistent); - } - - if (smp->slave_ltk) { - smp->slave_ltk->bdaddr_type = hcon->dst_type; - bacpy(&smp->slave_ltk->bdaddr, &hcon->dst); - mgmt_new_ltk(hdev, smp->slave_ltk, persistent); - } -} - -int smp_distribute_keys(struct l2cap_conn *conn) -{ - struct smp_cmd_pairing *req, *rsp; - struct l2cap_chan *chan = conn->smp; - struct smp_chan *smp = chan->data; - struct hci_conn *hcon = conn->hcon; - struct hci_dev *hdev = hcon->hdev; - __u8 *keydist; - - BT_DBG("conn %p", conn); - - if (!test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) - return 0; - - rsp = (void *) &smp->prsp[1]; - - /* The responder sends its keys first */ - if (hcon->out && (smp->remote_key_dist & 0x07)) - return 0; - - req = (void *) &smp->preq[1]; - - if (hcon->out) { - keydist = &rsp->init_key_dist; - *keydist &= req->init_key_dist; - } else { - keydist = &rsp->resp_key_dist; - *keydist &= req->resp_key_dist; - } - - BT_DBG("keydist 0x%x", *keydist); - - if (*keydist & SMP_DIST_ENC_KEY) { - struct smp_cmd_encrypt_info enc; - struct smp_cmd_master_ident ident; - struct smp_ltk *ltk; - u8 authenticated; - __le16 ediv; - __le64 rand; - - get_random_bytes(enc.ltk, sizeof(enc.ltk)); - get_random_bytes(&ediv, sizeof(ediv)); - get_random_bytes(&rand, sizeof(rand)); - - smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc); - - authenticated = hcon->sec_level == BT_SECURITY_HIGH; - ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, - SMP_LTK_SLAVE, authenticated, enc.ltk, - smp->enc_key_size, ediv, rand); - smp->slave_ltk = ltk; - - ident.ediv = ediv; - ident.rand = rand; - - smp_send_cmd(conn, SMP_CMD_MASTER_IDENT, sizeof(ident), &ident); - - *keydist &= ~SMP_DIST_ENC_KEY; - } - - if (*keydist & SMP_DIST_ID_KEY) { - struct smp_cmd_ident_addr_info addrinfo; - struct smp_cmd_ident_info idinfo; - - memcpy(idinfo.irk, hdev->irk, sizeof(idinfo.irk)); - - smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo); - - /* The hci_conn contains the local identity address - * after the connection has been established. - * - * This is true even when the connection has been - * established using a resolvable random address. - */ - bacpy(&addrinfo.bdaddr, &hcon->src); - addrinfo.addr_type = hcon->src_type; - - smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo), - &addrinfo); - - *keydist &= ~SMP_DIST_ID_KEY; - } - - if (*keydist & SMP_DIST_SIGN) { - struct smp_cmd_sign_info sign; - struct smp_csrk *csrk; - - /* Generate a new random key */ - get_random_bytes(sign.csrk, sizeof(sign.csrk)); - - csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); - if (csrk) { - csrk->master = 0x00; - memcpy(csrk->val, sign.csrk, sizeof(csrk->val)); - } - smp->slave_csrk = csrk; - - smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign); - - *keydist &= ~SMP_DIST_SIGN; - } - - /* If there are still keys to be received wait for them */ - if ((smp->remote_key_dist & 0x07)) - return 0; - - clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags); - cancel_delayed_work_sync(&conn->security_timer); - set_bit(SMP_FLAG_COMPLETE, &smp->flags); - smp_notify_keys(conn); - - smp_chan_destroy(conn); - - return 0; -} - static void smp_teardown_cb(struct l2cap_chan *chan, int err) { struct l2cap_conn *conn = chan->conn; @@ -1492,6 +1492,18 @@ static void smp_teardown_cb(struct l2cap_chan *chan, int err) l2cap_chan_put(chan); } +static void smp_resume_cb(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + struct hci_conn *hcon = conn->hcon; + + BT_DBG("chan %p", chan); + + if (test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) + smp_distribute_keys(conn); + cancel_delayed_work(&conn->security_timer); +} + static void smp_ready_cb(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; @@ -1524,13 +1536,13 @@ static const struct l2cap_ops smp_chan_ops = { .recv = smp_recv_cb, .alloc_skb = smp_alloc_skb_cb, .teardown = smp_teardown_cb, + .resume = smp_resume_cb, .new_connection = l2cap_chan_no_new_connection, .state_change = l2cap_chan_no_state_change, .close = l2cap_chan_no_close, .defer = l2cap_chan_no_defer, .suspend = l2cap_chan_no_suspend, - .resume = l2cap_chan_no_resume, .set_shutdown = l2cap_chan_no_set_shutdown, .get_sndtimeo = l2cap_chan_no_get_sndtimeo, .memcpy_fromiovec = l2cap_chan_no_memcpy_fromiovec, diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 161ace3c3234..59a594278a85 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -126,7 +126,6 @@ enum { /* SMP Commands */ bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); -int smp_distribute_keys(struct l2cap_conn *conn); int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey); void smp_chan_destroy(struct l2cap_conn *conn); -- cgit v1.2.3 From dec5b49235e2526d7aacf5b93ea48f5e30c2f7c3 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 11 Aug 2014 22:06:37 +0300 Subject: Bluetooth: Add public l2cap_conn_shutdown() API to request disconnection Since we no-longer do special handling of SMP within l2cap_core.c we don't have any code for calling l2cap_conn_del() when smp.c doesn't like the data it gets. At the same time we cannot simply export l2cap_conn_del() since it will try to lock the channels it calls into whereas we already hold the lock in the smp.c l2cap_chan callbacks (i.e. it'd lead to a deadlock). This patch adds a new l2cap_conn_shutdown() API which is very similar to l2cap_conn_del() except that it defers the call to l2cap_conn_del() through a workqueue, thereby making it safe to use it from an L2CAP channel callback. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 4 ++++ net/bluetooth/l2cap_core.c | 25 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index bda6252e3722..40f34866b6da 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -625,6 +625,9 @@ struct l2cap_conn { struct delayed_work info_timer; + int disconn_err; + struct work_struct disconn_work; + struct sk_buff *rx_skb; __u32 rx_len; __u8 tx_ident; @@ -944,6 +947,7 @@ void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan, u8 status); void __l2cap_physical_cfm(struct l2cap_chan *chan, int result); +void l2cap_conn_shutdown(struct l2cap_conn *conn, int err); void l2cap_conn_get(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 4de1e1827055..404998efa7e2 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1627,6 +1627,9 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (work_pending(&conn->pending_rx_work)) cancel_work_sync(&conn->pending_rx_work); + if (work_pending(&conn->disconn_work)) + cancel_work_sync(&conn->disconn_work); + l2cap_unregister_all_users(conn); mutex_lock(&conn->chan_lock); @@ -1669,6 +1672,26 @@ static void security_timeout(struct work_struct *work) } } +static void disconn_work(struct work_struct *work) +{ + struct l2cap_conn *conn = container_of(work, struct l2cap_conn, + disconn_work); + + BT_DBG("conn %p", conn); + + l2cap_conn_del(conn->hcon, conn->disconn_err); +} + +void l2cap_conn_shutdown(struct l2cap_conn *conn, int err) +{ + struct hci_dev *hdev = conn->hcon->hdev; + + BT_DBG("conn %p err %d", conn, err); + + conn->disconn_err = err; + queue_work(hdev->workqueue, &conn->disconn_work); +} + static void l2cap_conn_free(struct kref *ref) { struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref); @@ -6930,6 +6953,8 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) else INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); + INIT_WORK(&conn->disconn_work, disconn_work); + skb_queue_head_init(&conn->pending_rx); INIT_WORK(&conn->pending_rx_work, process_pending_rx); -- cgit v1.2.3 From 4befb867b9de8adc56c683f4cf6c9e6c035e94e3 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 11 Aug 2014 22:06:38 +0300 Subject: Bluetooth: Call l2cap_conn_shutdown() when SMP recv callback fails To restore pre-l2cap_chan functionality we should be trying to disconnect the connection when receviving garbage SMP data (i.e. when the SMP command handler fails). This patch renames the command handler back to smp_sig_channel() and adds a smp_recv_cb() wrapper function for calling it. If smp_sig_channel() fails the code calls l2cap_conn_shutdown(). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 28014ad3d2d3..7a295d7edc44 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1375,7 +1375,7 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } -static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) +static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) { struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; @@ -1514,6 +1514,24 @@ static void smp_ready_cb(struct l2cap_chan *chan) l2cap_chan_hold(chan); } +static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) +{ + int err; + + BT_DBG("chan %p", chan); + + err = smp_sig_channel(chan, skb); + if (err) { + struct l2cap_conn *conn = chan->conn; + + cancel_delayed_work_sync(&conn->security_timer); + + l2cap_conn_shutdown(chan->conn, -err); + } + + return err; +} + static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) -- cgit v1.2.3 From 8ae9b9845b3252216cf5d2e033e5cca41bae48ef Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 11 Aug 2014 22:06:39 +0300 Subject: Bluetooth: Fix double free of SMP data skb In the case that the SMP recv callback returns error the calling code in l2cap_core.c expects that it still owns the skb and will try to free it. The SMP code should therefore not try to free the skb if it return an error. This patch fixes such behavior in the SMP command handler function. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 7a295d7edc44..5103dc739a17 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1387,10 +1387,8 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) return 0; } - if (skb->len < 1) { - kfree_skb(skb); + if (skb->len < 1) return -EILSEQ; - } if (!test_bit(HCI_LE_ENABLED, &hcon->hdev->dev_flags)) { err = -EOPNOTSUPP; @@ -1410,8 +1408,9 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) if (code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ && !test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) { BT_ERR("Unexpected SMP command 0x%02x. Disconnecting.", code); - kfree_skb(skb); - return -EOPNOTSUPP; + reason = SMP_CMD_NOTSUPP; + err = -EOPNOTSUPP; + goto done; } switch (code) { @@ -1472,8 +1471,8 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) done: if (reason) smp_failure(conn, reason); - - kfree_skb(skb); + if (!err) + kfree_skb(skb); return err; } -- cgit v1.2.3 From b68fda6848ebef3499905500971d40b84faa8319 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 11 Aug 2014 22:06:40 +0300 Subject: Bluetooth: Add SMP-internal timeout callback This patch adds an SMP-internal timeout callback to remove the depenency on (the soon to be removed) l2cap_conn->security_timer. The behavior is the same as with l2cap_conn->security_timer except that the new l2cap_conn_shutdown() public function is used instead of the L2CAP core internal l2cap_conn_del(). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 5103dc739a17..7ab5f52955cb 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -44,7 +44,9 @@ enum { }; struct smp_chan { - struct l2cap_conn *conn; + struct l2cap_conn *conn; + struct delayed_work security_timer; + u8 preq[7]; /* SMP Pairing Request */ u8 prsp[7]; /* SMP Pairing Response */ u8 prnd[16]; /* SMP Pairing Random (local) */ @@ -251,6 +253,7 @@ static int smp_s1(struct smp_chan *smp, u8 k[16], u8 r1[16], u8 r2[16], static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) { struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp; struct kvec iv[2]; struct msghdr msg; @@ -272,8 +275,14 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) l2cap_chan_send(chan, &msg, 1 + len); - cancel_delayed_work_sync(&conn->security_timer); - schedule_delayed_work(&conn->security_timer, SMP_TIMEOUT); + if (!chan->data) + return; + + smp = chan->data; + + cancel_delayed_work_sync(&smp->security_timer); + if (test_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) + schedule_delayed_work(&smp->security_timer, SMP_TIMEOUT); } static __u8 authreq_to_seclevel(__u8 authreq) @@ -359,6 +368,8 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) static void smp_failure(struct l2cap_conn *conn, u8 reason) { struct hci_conn *hcon = conn->hcon; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp; if (reason) smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason), @@ -368,7 +379,12 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason) mgmt_auth_failed(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, HCI_ERROR_AUTH_FAILURE); - cancel_delayed_work_sync(&conn->security_timer); + if (!chan->data) + return; + + smp = chan->data; + + cancel_delayed_work_sync(&smp->security_timer); if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) smp_chan_destroy(conn); @@ -749,7 +765,7 @@ static int smp_distribute_keys(struct l2cap_conn *conn) return 0; clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags); - cancel_delayed_work_sync(&conn->security_timer); + cancel_delayed_work_sync(&smp->security_timer); set_bit(SMP_FLAG_COMPLETE, &smp->flags); smp_notify_keys(conn); @@ -758,6 +774,17 @@ static int smp_distribute_keys(struct l2cap_conn *conn) return 0; } +static void smp_timeout(struct work_struct *work) +{ + struct smp_chan *smp = container_of(work, struct smp_chan, + security_timer.work); + struct l2cap_conn *conn = smp->conn; + + BT_DBG("conn %p", conn); + + l2cap_conn_shutdown(conn, ETIMEDOUT); +} + static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) { struct l2cap_chan *chan = conn->smp; @@ -780,6 +807,8 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) smp->conn = conn; chan->data = smp; + INIT_DELAYED_WORK(&smp->security_timer, smp_timeout); + hci_conn_hold(conn->hcon); return smp; @@ -1479,11 +1508,12 @@ done: static void smp_teardown_cb(struct l2cap_chan *chan, int err) { struct l2cap_conn *conn = chan->conn; + struct smp_chan *smp = chan->data; BT_DBG("chan %p", chan); if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { - cancel_delayed_work_sync(&conn->security_timer); + cancel_delayed_work_sync(&smp->security_timer); smp_chan_destroy(conn); } @@ -1493,6 +1523,7 @@ static void smp_teardown_cb(struct l2cap_chan *chan, int err) static void smp_resume_cb(struct l2cap_chan *chan) { + struct smp_chan *smp = chan->data; struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; @@ -1500,7 +1531,9 @@ static void smp_resume_cb(struct l2cap_chan *chan) if (test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) smp_distribute_keys(conn); - cancel_delayed_work(&conn->security_timer); + + if (smp) + cancel_delayed_work(&smp->security_timer); } static void smp_ready_cb(struct l2cap_chan *chan) @@ -1521,9 +1554,10 @@ static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) err = smp_sig_channel(chan, skb); if (err) { - struct l2cap_conn *conn = chan->conn; + struct smp_chan *smp = chan->data; - cancel_delayed_work_sync(&conn->security_timer); + if (smp) + cancel_delayed_work_sync(&smp->security_timer); l2cap_conn_shutdown(chan->conn, -err); } -- cgit v1.2.3 From 276d807317dead63ef2f13aa46e3c17d57ba0713 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 11 Aug 2014 22:06:41 +0300 Subject: Bluetooth: Remove unused l2cap_conn->security_timer Now that there are no-longer any users for l2cap_conn->security_timer we can go ahead and simply remove it. The patch makes initialization of the conn->info_timer unconditional since it's better not to leave any l2cap_conn data structures uninitialized no matter what the underlying transport. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 - net/bluetooth/l2cap_core.c | 18 +----------------- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 40f34866b6da..cedda399f9c0 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -638,7 +638,6 @@ struct l2cap_conn { __u8 disc_reason; - struct delayed_work security_timer; struct l2cap_chan *smp; struct list_head chan_l; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 404998efa7e2..0cd7ed99558b 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1659,19 +1659,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_conn_put(conn); } -static void security_timeout(struct work_struct *work) -{ - struct l2cap_conn *conn = container_of(work, struct l2cap_conn, - security_timer.work); - - BT_DBG("conn %p", conn); - - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { - smp_chan_destroy(conn); - l2cap_conn_del(conn->hcon, ETIMEDOUT); - } -} - static void disconn_work(struct work_struct *work) { struct l2cap_conn *conn = container_of(work, struct l2cap_conn, @@ -6948,10 +6935,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) INIT_LIST_HEAD(&conn->chan_l); INIT_LIST_HEAD(&conn->users); - if (hcon->type == LE_LINK) - INIT_DELAYED_WORK(&conn->security_timer, security_timeout); - else - INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); + INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); INIT_WORK(&conn->disconn_work, disconn_work); -- cgit v1.2.3 From 109ec2309eb996fbe03302fbd40dec9014c6f849 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 11 Aug 2014 22:06:42 +0300 Subject: Bluetooth: Move canceling security_timer into smp_chan_destroy() All places needing to cancel the security timer also call smp_chan_destroy() in the same go. To eliminate the need to do these two calls in multiple places simply move the timer cancellation into smp_chan_destroy(). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 7ab5f52955cb..cdae40869447 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -384,8 +384,6 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason) smp = chan->data; - cancel_delayed_work_sync(&smp->security_timer); - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) smp_chan_destroy(conn); } @@ -765,7 +763,6 @@ static int smp_distribute_keys(struct l2cap_conn *conn) return 0; clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags); - cancel_delayed_work_sync(&smp->security_timer); set_bit(SMP_FLAG_COMPLETE, &smp->flags); smp_notify_keys(conn); @@ -822,6 +819,11 @@ void smp_chan_destroy(struct l2cap_conn *conn) BUG_ON(!smp); + cancel_delayed_work_sync(&smp->security_timer); + /* In case the timeout freed the SMP context */ + if (!chan->data) + return; + complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags); mgmt_smp_complete(conn->hcon, complete); @@ -1512,10 +1514,8 @@ static void smp_teardown_cb(struct l2cap_chan *chan, int err) BT_DBG("chan %p", chan); - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { - cancel_delayed_work_sync(&smp->security_timer); + if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) smp_chan_destroy(conn); - } conn->smp = NULL; l2cap_chan_put(chan); -- cgit v1.2.3 From 86d1407cb9cd3cb866eae24a2aedb8006160db69 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 11 Aug 2014 22:06:43 +0300 Subject: Bluetooth: Always call smp_distribute_keys() from a workqueue The smp_distribute_keys() function calls smp_notify_keys() which in turn calls l2cap_conn_update_id_addr(). The l2cap_conn_update_id_addr() function will iterate through all L2CAP channels for the respective connection: lock the channel, update the address information and unlock the channel. Since SMP is now using l2cap_chan callbacks each callback is called with the channel lock held. Therefore, calling l2cap_conn_update_id_addr() would cause a deadlock calling l2cap_chan_lock() on the SMP channel. This patch moves calling smp_distribute_keys() through a workqueue so that it is never called from an L2CAP channel callback. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index cdae40869447..312ea5ec2d7d 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -46,6 +46,7 @@ enum { struct smp_chan { struct l2cap_conn *conn; struct delayed_work security_timer; + struct work_struct distribute_work; u8 preq[7]; /* SMP Pairing Request */ u8 prsp[7]; /* SMP Pairing Response */ @@ -656,11 +657,12 @@ static void smp_notify_keys(struct l2cap_conn *conn) } } -static int smp_distribute_keys(struct l2cap_conn *conn) +static void smp_distribute_keys(struct work_struct *work) { + struct smp_chan *smp = container_of(work, struct smp_chan, + distribute_work); struct smp_cmd_pairing *req, *rsp; - struct l2cap_chan *chan = conn->smp; - struct smp_chan *smp = chan->data; + struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; __u8 *keydist; @@ -668,13 +670,13 @@ static int smp_distribute_keys(struct l2cap_conn *conn) BT_DBG("conn %p", conn); if (!test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) - return 0; + return; rsp = (void *) &smp->prsp[1]; /* The responder sends its keys first */ if (hcon->out && (smp->remote_key_dist & 0x07)) - return 0; + return; req = (void *) &smp->preq[1]; @@ -760,15 +762,13 @@ static int smp_distribute_keys(struct l2cap_conn *conn) /* If there are still keys to be received wait for them */ if ((smp->remote_key_dist & 0x07)) - return 0; + return; clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags); set_bit(SMP_FLAG_COMPLETE, &smp->flags); smp_notify_keys(conn); smp_chan_destroy(conn); - - return 0; } static void smp_timeout(struct work_struct *work) @@ -804,6 +804,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) smp->conn = conn; chan->data = smp; + INIT_WORK(&smp->distribute_work, smp_distribute_keys); INIT_DELAYED_WORK(&smp->security_timer, smp_timeout); hci_conn_hold(conn->hcon); @@ -824,6 +825,12 @@ void smp_chan_destroy(struct l2cap_conn *conn) if (!chan->data) return; + if (work_pending(&smp->distribute_work)) { + cancel_work_sync(&smp->distribute_work); + if (!chan->data) + return; + } + complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags); mgmt_smp_complete(conn->hcon, complete); @@ -1287,7 +1294,7 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) rp->ediv, rp->rand); smp->ltk = ltk; if (!(smp->remote_key_dist & SMP_DIST_ID_KEY)) - smp_distribute_keys(conn); + queue_work(hdev->workqueue, &smp->distribute_work); hci_dev_unlock(hdev); return 0; @@ -1322,6 +1329,7 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; + struct hci_dev *hdev = hcon->hdev; bdaddr_t rpa; BT_DBG(""); @@ -1364,7 +1372,7 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, smp->id_addr_type, smp->irk, &rpa); distribute: - smp_distribute_keys(conn); + queue_work(hdev->workqueue, &smp->distribute_work); hci_dev_unlock(hcon->hdev); @@ -1400,7 +1408,7 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(csrk->val, rp->csrk, sizeof(csrk->val)); } smp->csrk = csrk; - smp_distribute_keys(conn); + queue_work(hdev->workqueue, &smp->distribute_work); hci_dev_unlock(hdev); return 0; @@ -1510,7 +1518,6 @@ done: static void smp_teardown_cb(struct l2cap_chan *chan, int err) { struct l2cap_conn *conn = chan->conn; - struct smp_chan *smp = chan->data; BT_DBG("chan %p", chan); @@ -1526,14 +1533,17 @@ static void smp_resume_cb(struct l2cap_chan *chan) struct smp_chan *smp = chan->data; struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; + struct hci_dev *hdev = hcon->hdev; BT_DBG("chan %p", chan); - if (test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) - smp_distribute_keys(conn); + if (!smp) + return; - if (smp) - cancel_delayed_work(&smp->security_timer); + cancel_delayed_work(&smp->security_timer); + + if (test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) + queue_work(hdev->workqueue, &smp->distribute_work); } static void smp_ready_cb(struct l2cap_chan *chan) -- cgit v1.2.3 From 6f48e260a95c1a0161e5be39adb0f20c737fe459 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 11 Aug 2014 22:06:44 +0300 Subject: Bluetooth: Make smp_chan_destroy() private to smp.c There are no external users of smp_chan_destroy() so make it private to smp.c. The patch also moves the function higher up in the c-file in order to avoid forward declarations. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 100 ++++++++++++++++++++++++++-------------------------- net/bluetooth/smp.h | 2 -- 2 files changed, 50 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 312ea5ec2d7d..07ca4ce0943b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -366,6 +366,56 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) return 0; } +static void smp_chan_destroy(struct l2cap_conn *conn) +{ + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; + bool complete; + + BUG_ON(!smp); + + cancel_delayed_work_sync(&smp->security_timer); + /* In case the timeout freed the SMP context */ + if (!chan->data) + return; + + if (work_pending(&smp->distribute_work)) { + cancel_work_sync(&smp->distribute_work); + if (!chan->data) + return; + } + + complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags); + mgmt_smp_complete(conn->hcon, complete); + + kfree(smp->csrk); + kfree(smp->slave_csrk); + + crypto_free_blkcipher(smp->tfm_aes); + + /* If pairing failed clean up any keys we might have */ + if (!complete) { + if (smp->ltk) { + list_del(&smp->ltk->list); + kfree(smp->ltk); + } + + if (smp->slave_ltk) { + list_del(&smp->slave_ltk->list); + kfree(smp->slave_ltk); + } + + if (smp->remote_irk) { + list_del(&smp->remote_irk->list); + kfree(smp->remote_irk); + } + } + + chan->data = NULL; + kfree(smp); + hci_conn_drop(conn->hcon); +} + static void smp_failure(struct l2cap_conn *conn, u8 reason) { struct hci_conn *hcon = conn->hcon; @@ -812,56 +862,6 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) return smp; } -void smp_chan_destroy(struct l2cap_conn *conn) -{ - struct l2cap_chan *chan = conn->smp; - struct smp_chan *smp = chan->data; - bool complete; - - BUG_ON(!smp); - - cancel_delayed_work_sync(&smp->security_timer); - /* In case the timeout freed the SMP context */ - if (!chan->data) - return; - - if (work_pending(&smp->distribute_work)) { - cancel_work_sync(&smp->distribute_work); - if (!chan->data) - return; - } - - complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags); - mgmt_smp_complete(conn->hcon, complete); - - kfree(smp->csrk); - kfree(smp->slave_csrk); - - crypto_free_blkcipher(smp->tfm_aes); - - /* If pairing failed clean up any keys we might have */ - if (!complete) { - if (smp->ltk) { - list_del(&smp->ltk->list); - kfree(smp->ltk); - } - - if (smp->slave_ltk) { - list_del(&smp->slave_ltk->list); - kfree(smp->slave_ltk); - } - - if (smp->remote_irk) { - list_del(&smp->remote_irk->list); - kfree(smp->remote_irk); - } - } - - chan->data = NULL; - kfree(smp); - hci_conn_drop(conn->hcon); -} - int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) { struct l2cap_conn *conn = hcon->l2cap_data; diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 59a594278a85..cf1094617c69 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -128,8 +128,6 @@ bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey); -void smp_chan_destroy(struct l2cap_conn *conn); - bool smp_irk_matches(struct hci_dev *hdev, u8 irk[16], bdaddr_t *bdaddr); int smp_generate_rpa(struct hci_dev *hdev, u8 irk[16], bdaddr_t *rpa); -- cgit v1.2.3 From 24bbd44a96c7a209fafbf1b28f0ac1a00cf4e551 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Mon, 11 Aug 2014 13:25:07 +0200 Subject: mac802154: cleanup in rx path This patch replace the sizeof(struct rx_work) with sizeof(*work) and directly passing the skb in mac802154_subif_rx() Signed-off-by: Varka Bhadram Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/rx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 7f820a108a9c..a14cf9ede171 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -86,9 +86,8 @@ fail: static void mac802154_rx_worker(struct work_struct *work) { struct rx_work *rw = container_of(work, struct rx_work, work); - struct sk_buff *skb = rw->skb; - mac802154_subif_rx(rw->dev, skb, rw->lqi); + mac802154_subif_rx(rw->dev, rw->skb, rw->lqi); kfree(rw); } @@ -101,7 +100,7 @@ ieee802154_rx_irqsafe(struct ieee802154_dev *dev, struct sk_buff *skb, u8 lqi) if (!skb) return; - work = kzalloc(sizeof(struct rx_work), GFP_ATOMIC); + work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return; -- cgit v1.2.3 From b288a4963f09459c29cca240d3352cc7915710fc Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Mon, 11 Aug 2014 13:25:08 +0200 Subject: mac802154: common error path By introducing label fail, making the common error path for mac802154_llsec_decrypt() and packet type default case. Signed-off-by: Varka Bhadram Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/wpan.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac802154/wpan.c b/net/mac802154/wpan.c index 3c3069fd6971..ce1757001917 100644 --- a/net/mac802154/wpan.c +++ b/net/mac802154/wpan.c @@ -472,8 +472,7 @@ mac802154_subif_frame(struct mac802154_sub_if_data *sdata, struct sk_buff *skb, rc = mac802154_llsec_decrypt(&sdata->sec, skb); if (rc) { pr_debug("decryption failed: %i\n", rc); - kfree_skb(skb); - return NET_RX_DROP; + goto fail; } sdata->dev->stats.rx_packets++; @@ -485,9 +484,12 @@ mac802154_subif_frame(struct mac802154_sub_if_data *sdata, struct sk_buff *skb, default: pr_warn("ieee802154: bad frame received (type = %d)\n", mac_cb(skb)->type); - kfree_skb(skb); - return NET_RX_DROP; + goto fail; } + +fail: + kfree_skb(skb); + return NET_RX_DROP; } static void mac802154_print_addr(const char *name, -- cgit v1.2.3 From 0ba1f94e72b811215ce2f4610fe0f6cf88f4b28a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 11 Aug 2014 13:25:09 +0200 Subject: ieee802154: 6lowpan: remove unused function Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan_rtnl.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan_rtnl.c b/net/ieee802154/6lowpan_rtnl.c index 016b77ee88f0..509d7a625a4b 100644 --- a/net/ieee802154/6lowpan_rtnl.c +++ b/net/ieee802154/6lowpan_rtnl.c @@ -77,14 +77,6 @@ lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) return netdev_priv(dev); } -static inline void lowpan_address_flip(u8 *src, u8 *dest) -{ - int i; - - for (i = 0; i < IEEE802154_ADDR_LEN; i++) - (dest)[IEEE802154_ADDR_LEN - i - 1] = (src)[i]; -} - static int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) -- cgit v1.2.3 From f55889128a776b51581394b20abd0b470304cf95 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Mon, 11 Aug 2014 13:25:10 +0200 Subject: mac802154: common tx error path This patch introduce the common error path on failure of Tx by inserting the label 'err_tx'. Signed-off-by: Varka Bhadram Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/tx.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 8124353646ae..fdf4c0e67259 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -89,8 +89,7 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb, if (!(priv->phy->channels_supported[page] & (1 << chan))) { WARN_ON(1); - kfree_skb(skb); - return NETDEV_TX_OK; + goto err_tx; } mac802154_monitors_rx(mac802154_to_priv(&priv->hw), skb); @@ -103,12 +102,10 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb, data[1] = crc >> 8; } - if (skb_cow_head(skb, priv->hw.extra_tx_headroom)) { - kfree_skb(skb); - return NETDEV_TX_OK; - } + if (skb_cow_head(skb, priv->hw.extra_tx_headroom)) + goto err_tx; - work = kzalloc(sizeof(struct xmit_work), GFP_ATOMIC); + work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { kfree_skb(skb); return NETDEV_TX_BUSY; @@ -129,4 +126,8 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb, queue_work(priv->dev_workqueue, &work->work); return NETDEV_TX_OK; + +err_tx: + kfree_skb(skb); + return NETDEV_TX_OK; } -- cgit v1.2.3 From 069cb27017de6476d47a70fbf144f69200d3e854 Mon Sep 17 00:00:00 2001 From: Lukasz Rymanowski Date: Wed, 13 Aug 2014 16:01:41 +0200 Subject: Bluetooth: Improve data packing in SAR mode There is no need to decrease pdu size with L2CAP SDU lenght in Start L2CAP SDU frame. Start packtet is just 2 bytes longer as specified and we can keep payload as long as possible. When testing SAR L2CAP against PTS, L2CAP channel is usually configured in that way, that SDU = MPS * 3. PTS expets then 3 I-Frames from IUT: Start, Continuation and End frame. Without this fix, we sent 4 I-Frames. We could pass a test by using -b option in l2test and send just two bytes less than SDU length. With this patch no need to use -b option. Signed-off-by: Lukasz Rymanowski Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 0cd7ed99558b..ebe7454362f0 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -2275,7 +2275,6 @@ static int l2cap_segment_sdu(struct l2cap_chan *chan, } else { sar = L2CAP_SAR_START; sdu_len = len; - pdu_len -= L2CAP_SDULEN_SIZE; } while (len > 0) { @@ -2290,10 +2289,8 @@ static int l2cap_segment_sdu(struct l2cap_chan *chan, __skb_queue_tail(seg_queue, skb); len -= pdu_len; - if (sdu_len) { + if (sdu_len) sdu_len = 0; - pdu_len += L2CAP_SDULEN_SIZE; - } if (len <= pdu_len) { sar = L2CAP_SAR_END; -- cgit v1.2.3 From 13cac15296afe7e42088ecfcd0f1d4b658248c46 Mon Sep 17 00:00:00 2001 From: Lukasz Rymanowski Date: Thu, 14 Aug 2014 09:35:34 +0200 Subject: Bluetooth: Fix ERTM L2CAP resend packet I-Frame which is going to be resend already has FCS field added and set (if it was required). Adding additional FCS field calculated from data + old FCS in resend function is incorrect. This patch fix that. Issue has been found during PTS testing. Signed-off-by: Lukasz Rymanowski Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ebe7454362f0..4a90438d99df 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1968,10 +1968,12 @@ static void l2cap_ertm_resend(struct l2cap_chan *chan) tx_skb->data + L2CAP_HDR_SIZE); } + /* Update FCS */ if (chan->fcs == L2CAP_FCS_CRC16) { - u16 fcs = crc16(0, (u8 *) tx_skb->data, tx_skb->len); - put_unaligned_le16(fcs, skb_put(tx_skb, - L2CAP_FCS_SIZE)); + u16 fcs = crc16(0, (u8 *) tx_skb->data, + tx_skb->len - L2CAP_FCS_SIZE); + put_unaligned_le16(fcs, skb_tail_pointer(tx_skb) - + L2CAP_FCS_SIZE); } l2cap_do_send(chan, tx_skb); -- cgit v1.2.3 From a74a8c846fb699f3277c0c21278bd4c414074b4a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 22 Jul 2014 14:50:47 +0200 Subject: mac80211: don't duplicate station QoS capability data We currently track the QoS capability twice: for all peer stations in the WLAN_STA_WME flag, and for any clients associated to an AP interface separately for drivers in the sta->sta.wme field. Remove the WLAN_STA_WME flag and track the capability only in the driver-visible field, getting rid of the limitation that the field is only valid in AP mode. Reviewed-by: Arik Nemtsov Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- net/mac80211/cfg.c | 13 +++---------- net/mac80211/debugfs_sta.c | 3 ++- net/mac80211/ibss.c | 2 +- net/mac80211/mesh_plink.c | 4 +--- net/mac80211/mlme.c | 3 +-- net/mac80211/sta_info.c | 4 ++-- net/mac80211/sta_info.h | 2 -- net/mac80211/tdls.c | 3 +-- net/mac80211/tx.c | 6 +++--- net/mac80211/wme.c | 4 ++-- 11 files changed, 17 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index dae2e24616e1..1cd84444665c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1405,7 +1405,7 @@ struct ieee80211_sta_rates { * @supp_rates: Bitmap of supported rates (per band) * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities - * @wme: indicates whether the STA supports WME. Only valid during AP-mode. + * @wme: indicates whether the STA supports QoS/WME. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void *), size is determined in hw information. * @uapsd_queues: bitmap of queues configured for uapsd. Only valid diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 927b4ea0128b..4d8989b87960 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1011,15 +1011,8 @@ static int sta_apply_parameters(struct ieee80211_local *local, clear_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE); } - if (mask & BIT(NL80211_STA_FLAG_WME)) { - if (set & BIT(NL80211_STA_FLAG_WME)) { - set_sta_flag(sta, WLAN_STA_WME); - sta->sta.wme = true; - } else { - clear_sta_flag(sta, WLAN_STA_WME); - sta->sta.wme = false; - } - } + if (mask & BIT(NL80211_STA_FLAG_WME)) + sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME); if (mask & BIT(NL80211_STA_FLAG_MFP)) { if (set & BIT(NL80211_STA_FLAG_MFP)) @@ -3352,7 +3345,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, band = chanctx_conf->def.chan->band; sta = sta_info_get_bss(sdata, peer); if (sta) { - qos = test_sta_flag(sta, WLAN_STA_WME); + qos = sta->sta.wme; } else { rcu_read_unlock(); return -ENOLINK; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 3db96648b45a..4a20fb8f1e23 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -77,7 +77,8 @@ static ssize_t sta_flags_read(struct file *file, char __user *userbuf, TEST(AUTH), TEST(ASSOC), TEST(PS_STA), TEST(PS_DRIVER), TEST(AUTHORIZED), TEST(SHORT_PREAMBLE), - TEST(WME), TEST(WDS), TEST(CLEAR_PS_FILT), + sta->sta.wme ? "WME\n" : "", + TEST(WDS), TEST(CLEAR_PS_FILT), TEST(MFP), TEST(BLOCK_BA), TEST(PSPOLL), TEST(UAPSD), TEST(SP), TEST(TDLS_PEER), TEST(TDLS_PEER_AUTH), TEST(4ADDR_EVENT), diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 9713dc54ea4b..5f9654d31a8d 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1038,7 +1038,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, } if (sta && elems->wmm_info) - set_sta_flag(sta, WLAN_STA_WME); + sta->sta.wme = true; if (sta && elems->ht_operation && elems->ht_cap_elem && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT && diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 63b874101b27..8d67c1ebfbda 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -431,14 +431,12 @@ __mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr) return NULL; sta->plink_state = NL80211_PLINK_LISTEN; + sta->sta.wme = true; sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); - set_sta_flag(sta, WLAN_STA_WME); - sta->sta.wme = true; - return sta; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 31a8afaf7332..0e8d59a0e17e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2677,8 +2677,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) set_sta_flag(sta, WLAN_STA_MFP); - if (elems.wmm_param) - set_sta_flag(sta, WLAN_STA_WME); + sta->sta.wme = elems.wmm_param; err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index c6ee2139fbc5..e1f957d5935e 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1179,7 +1179,7 @@ static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; - bool qos = test_sta_flag(sta, WLAN_STA_WME); + bool qos = sta->sta.wme; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; @@ -1834,7 +1834,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE); - if (test_sta_flag(sta, WLAN_STA_WME)) + if (sta->sta.wme) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME); if (test_sta_flag(sta, WLAN_STA_MFP)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index d411bcc8ef08..89c40d5c0633 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -31,7 +31,6 @@ * when virtual port control is not in use. * @WLAN_STA_SHORT_PREAMBLE: Station is capable of receiving short-preamble * frames. - * @WLAN_STA_WME: Station is a QoS-STA. * @WLAN_STA_WDS: Station is one of our WDS peers. * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the * IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next @@ -69,7 +68,6 @@ enum ieee80211_sta_info_flags { WLAN_STA_PS_STA, WLAN_STA_AUTHORIZED, WLAN_STA_SHORT_PREAMBLE, - WLAN_STA_WME, WLAN_STA_WDS, WLAN_STA_CLEAR_PS_FILT, WLAN_STA_MFP, diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 1b21050be174..f2cb3b6c1871 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -316,8 +316,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, } /* add the QoS param IE if both the peer and we support it */ - if (local->hw.queues >= IEEE80211_NUM_ACS && - test_sta_flag(sta, WLAN_STA_WME)) + if (local->hw.queues >= IEEE80211_NUM_ACS && sta->sta.wme) ieee80211_tdls_add_wmm_param_ie(sdata, skb); /* add any custom IEs that go before HT operation */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 464106c023d8..2051cc60f8ce 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1844,7 +1844,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 30; authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); - wme_sta = test_sta_flag(sta, WLAN_STA_WME); + wme_sta = sta->sta.wme; } ap_sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); @@ -1957,7 +1957,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, if (sta) { authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); - wme_sta = test_sta_flag(sta, WLAN_STA_WME); + wme_sta = sta->sta.wme; tdls_peer = test_sta_flag(sta, WLAN_STA_TDLS_PEER); tdls_auth = test_sta_flag(sta, @@ -2035,7 +2035,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, sta = sta_info_get(sdata, hdr.addr1); if (sta) { authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); - wme_sta = test_sta_flag(sta, WLAN_STA_WME); + wme_sta = sta->sta.wme; } } diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index d51422c778de..6459946f0b74 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -118,7 +118,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_AP_VLAN: sta = rcu_dereference(sdata->u.vlan.sta); if (sta) { - qos = test_sta_flag(sta, WLAN_STA_WME); + qos = sta->sta.wme; break; } case NL80211_IFTYPE_AP: @@ -145,7 +145,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, if (!sta && ra && !is_multicast_ether_addr(ra)) { sta = sta_info_get(sdata, ra); if (sta) - qos = test_sta_flag(sta, WLAN_STA_WME); + qos = sta->sta.wme; } rcu_read_unlock(); -- cgit v1.2.3 From 53b954ee4a71e782d7dfcdeee5bf4695caeeb112 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Thu, 24 Jul 2014 11:20:05 +0300 Subject: mac80211: disable 40MHz support in case of 20MHz AP If the AP only advertises support for 20MHz (in the ht operation ie), disable 40MHz and VHT. This can improve interoperability with APs that don't like stations exceeding their own advertised capabilities. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0e8d59a0e17e..29fe91d6a094 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -149,6 +149,7 @@ static u32 ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, + const struct ieee80211_ht_cap *ht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, struct cfg80211_chan_def *chandef, bool tracking) @@ -162,13 +163,19 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, chandef->center_freq1 = channel->center_freq; chandef->center_freq2 = 0; - if (!ht_oper || !sband->ht_cap.ht_supported) { + if (!ht_cap || !ht_oper || !sband->ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; goto out; } chandef->width = NL80211_CHAN_WIDTH_20; + if (!(ht_cap->cap_info & + cpu_to_le16(IEEE80211_HT_CAP_SUP_WIDTH_20_40))) { + ret = IEEE80211_STA_DISABLE_40MHZ | IEEE80211_STA_DISABLE_VHT; + goto out; + } + ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan, channel->band); /* check that channel matches the right operating channel */ @@ -328,6 +335,7 @@ out: static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, + const struct ieee80211_ht_cap *ht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, const u8 *bssid, u32 *changed) @@ -367,8 +375,9 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[chan->band]; /* calculate new channel (type) based on HT/VHT operation IEs */ - flags = ieee80211_determine_chantype(sdata, sband, chan, ht_oper, - vht_oper, &chandef, true); + flags = ieee80211_determine_chantype(sdata, sband, chan, + ht_cap, ht_oper, vht_oper, + &chandef, true); /* * Downgrade the new channel if we associated with restricted @@ -3173,7 +3182,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, bssid); - if (ieee80211_config_bw(sdata, sta, elems.ht_operation, + if (ieee80211_config_bw(sdata, sta, + elems.ht_cap_elem, elems.ht_operation, elems.vht_operation, bssid, &changed)) { mutex_unlock(&local->sta_mtx); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, @@ -3807,6 +3817,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + const struct ieee80211_ht_cap *ht_cap = NULL; const struct ieee80211_ht_operation *ht_oper = NULL; const struct ieee80211_vht_operation *vht_oper = NULL; struct ieee80211_supported_band *sband; @@ -3823,14 +3834,17 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && sband->ht_cap.ht_supported) { - const u8 *ht_oper_ie, *ht_cap; + const u8 *ht_oper_ie, *ht_cap_ie; ht_oper_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_OPERATION); if (ht_oper_ie && ht_oper_ie[1] >= sizeof(*ht_oper)) ht_oper = (void *)(ht_oper_ie + 2); - ht_cap = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_CAPABILITY); - if (!ht_cap || ht_cap[1] < sizeof(struct ieee80211_ht_cap)) { + ht_cap_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_CAPABILITY); + if (ht_cap_ie && ht_cap_ie[1] >= sizeof(*ht_cap)) + ht_cap = (void *)(ht_cap_ie + 2); + + if (!ht_cap) { ifmgd->flags |= IEEE80211_STA_DISABLE_HT; ht_oper = NULL; } @@ -3861,7 +3875,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, - ht_oper, vht_oper, + ht_cap, ht_oper, vht_oper, &chandef, false); sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss), -- cgit v1.2.3 From 1dced6a854827eb5683f3c57ddbb4595daf145e4 Mon Sep 17 00:00:00 2001 From: Sébastien Barré Date: Sun, 17 Aug 2014 09:19:54 +0200 Subject: ipv4: Restore accept_local behaviour in fib_validate_source() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 7a9bc9b81a5b ("ipv4: Elide fib_validate_source() completely when possible.") introduced a short-circuit to avoid calling fib_validate_source when not needed. That change took rp_filter into account, but not accept_local. This resulted in a change of behaviour: with rp_filter and accept_local off, incoming packets with a local address in the source field should be dropped. Here is how to reproduce the change pre/post 7a9bc9b81a5b commit: -configure the same IPv4 address on hosts A and B. -try to send an ARP request from B to A. -The ARP request will be dropped before that commit, but accepted and answered after that commit. This adds a check for ACCEPT_LOCAL, to maintain full fib validation in case it is 0. We also leave __fib_validate_source() earlier when possible, based on the same check as fib_validate_source(), once the accept_local stuff is verified. Cc: Gregory Detal Cc: Christoph Paasch Cc: Hannes Frederic Sowa Cc: Sergei Shtylyov Signed-off-by: Sébastien Barré Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 255aa9946fe7..23104a3f2924 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -243,7 +243,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { - int ret, no_addr, accept_local; + int ret, no_addr; struct fib_result res; struct flowi4 fl4; struct net *net; @@ -258,16 +258,17 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, no_addr = idev->ifa_list == NULL; - accept_local = IN_DEV_ACCEPT_LOCAL(idev); fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; net = dev_net(dev); if (fib_lookup(net, &fl4, &res)) goto last_resort; - if (res.type != RTN_UNICAST) { - if (res.type != RTN_LOCAL || !accept_local) - goto e_inval; - } + if (res.type != RTN_UNICAST && + (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) + goto e_inval; + if (!rpf && !fib_num_tclassid_users(dev_net(dev)) && + (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) + goto last_resort; fib_combine_itag(itag, &res); dev_match = false; @@ -321,6 +322,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); if (!r && !fib_num_tclassid_users(dev_net(dev)) && + IN_DEV_ACCEPT_LOCAL(idev) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { *itag = 0; return 0; -- cgit v1.2.3 From e6b688838e25b22e10dd273b48581cb2f62ec16e Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Sun, 17 Aug 2014 15:49:41 +0300 Subject: net/ipv4/igmp.c: Replace rcu_dereference() with rcu_access_pointer() The "rcu_dereference()" call is used directly in a condition. Since its return value is never dereferenced it is recommended to use "rcu_access_pointer()" instead of "rcu_dereference()". Therefore, this patch makes the replacement. The following Coccinelle semantic patch was used: @@ @@ ( if( (<+... - rcu_dereference + rcu_access_pointer (...) ...+>)) {...} | while( (<+... - rcu_dereference + rcu_access_pointer (...) ...+>)) {...} ) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f10eab462282..890c4258804c 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2539,7 +2539,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) querier = "NONE"; #endif - if (rcu_dereference(state->in_dev->mc_list) == im) { + if (rcu_access_pointer(state->in_dev->mc_list) == im) { seq_printf(seq, "%d\t%-10s: %5d %7s\n", state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); } -- cgit v1.2.3 From 8c6b00c816191ded80d1ccd5164b53168255ec15 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Sun, 17 Aug 2014 16:29:43 +0300 Subject: net/openvswitch/flow.c: Replace rcu_dereference() with rcu_access_pointer() The "rcu_dereference()" call is used directly in a condition. Since its return value is never dereferenced it is recommended to use "rcu_access_pointer()" instead of "rcu_dereference()". Therefore, this patch makes the replacement. The following Coccinelle semantic patch was used: @@ @@ ( if( (<+... - rcu_dereference + rcu_access_pointer (...) ...+>)) {...} | while( (<+... - rcu_dereference + rcu_access_pointer (...) ...+>)) {...} ) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index d07ab538fc9d..7064da92f420 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -89,7 +89,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, * allocated stats as we have already locked them. */ if (likely(flow->stats_last_writer != NUMA_NO_NODE) - && likely(!rcu_dereference(flow->stats[node]))) { + && likely(!rcu_access_pointer(flow->stats[node]))) { /* Try to allocate node-specific stats. */ struct flow_stats *new_stats; -- cgit v1.2.3 From 0932997e34bad52353c25756c55ccf97c522ae7c Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Fri, 22 Aug 2014 16:06:09 +0300 Subject: br_multicast: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 7751c92c8c57..648d79ccf462 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1822,7 +1822,7 @@ static void br_multicast_query_expired(struct net_bridge *br, if (query->startup_sent < br->multicast_startup_query_count) query->startup_sent++; - rcu_assign_pointer(querier, NULL); + RCU_INIT_POINTER(querier, NULL); br_multicast_send_query(br, NULL, query); spin_unlock(&br->multicast_lock); } -- cgit v1.2.3 From b5c5c36d367c670b9a93b1029d9b3af8610d9535 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Wed, 20 Aug 2014 23:13:07 +0530 Subject: dn_dev: Use time_before The functions time_before, time_before_eq, time_after, and time_after_eq are more robust for comparing jiffies against other values. A simplified version of the Coccinelle semantic patch making this change is as follows: @change@ expression E1,E2; @@ ( - (jiffies - E1) < E2 + time_before(jiffies, E1+E2) ) Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: David S. Miller --- net/decnet/dn_dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 3b726f31c64c..4400da7739da 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -875,7 +876,7 @@ static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa) static int dn_am_i_a_router(struct dn_neigh *dn, struct dn_dev *dn_db, struct dn_ifaddr *ifa) { /* First check time since device went up */ - if ((jiffies - dn_db->uptime) < DRDELAY) + if (time_before(jiffies, dn_db->uptime + DRDELAY)) return 0; /* If there is no router, then yes... */ -- cgit v1.2.3 From c72c95a064e55923c5dd050d099e51ac550f29a2 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Wed, 20 Aug 2014 23:14:10 +0530 Subject: ipconfig: Use time_before The functions time_before, time_before_eq, time_after, and time_after_eq are more robust for comparing jiffies against other values. A simplified version of the Coccinelle semantic patch making this change is as follows: @change@ expression E1,E2; @@ - jiffies - E1 < E2 + time_before(jiffies, E1+E2) Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 5bbef4fdcb43..648fa1490ea7 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -262,7 +262,8 @@ static int __init ic_open_devs(void) /* wait for a carrier on at least one device */ start = jiffies; next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); - while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { + while (time_before(jiffies, start + + msecs_to_jiffies(CONF_CARRIER_TIMEOUT))) { int wait, elapsed; for_each_netdev(&init_net, dev) -- cgit v1.2.3 From 8b1b1eb521004cec2518307c22dba8f4bff1c2bf Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Wed, 20 Aug 2014 23:20:09 +0530 Subject: decnet: Use time_after_eq The functions time_before, time_before_eq, time_after, and time_after_eq are more robust for comparing jiffies against other values. A simplified version of the Coccinelle semantic patch making this change is as follows: @change@ expression E1,E2; @@ - (jiffies - E1) >= E2 + time_after_eq(jiffies, E1+E2) Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: David S. Miller --- net/decnet/dn_timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c index d9c150cc59a9..1d330fd43dc7 100644 --- a/net/decnet/dn_timer.c +++ b/net/decnet/dn_timer.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -91,7 +92,7 @@ static void dn_slow_timer(unsigned long arg) * since the last successful transmission. */ if (scp->keepalive && scp->keepalive_fxn && (scp->state == DN_RUN)) { - if ((jiffies - scp->stamp) >= scp->keepalive) + if (time_after_eq(jiffies, scp->stamp + scp->keepalive)) scp->keepalive_fxn(sk); } -- cgit v1.2.3 From c0b802367b05fa6342ab9ef07abdf446b9ba223f Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Wed, 20 Aug 2014 23:24:40 +0530 Subject: af_decnet: Use time_after_eq The functions time_before, time_before_eq, time_after, and time_after_eq are more robust for comparing jiffies against other values. A simplified version of the Coccinelle semantic patch making this change is as follows: @change@ expression E1,E2,E3; @@ - jiffies - E1 >= (E2*E3) + time_after_eq(jiffies, E1+E2*E3) Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: David S. Miller --- net/decnet/af_decnet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index ae011b46c071..25733d538147 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -127,6 +127,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat #include #include #include +#include #include #include #include @@ -598,7 +599,7 @@ int dn_destroy_timer(struct sock *sk) if (sk->sk_socket) return 0; - if ((jiffies - scp->stamp) >= (HZ * decnet_time_wait)) { + if (time_after_eq(jiffies, scp->stamp + HZ * decnet_time_wait)) { dn_unhash_sock(sk); sock_put(sk); return 1; -- cgit v1.2.3 From d2de875c6d4cbec8a99c880160181a3ed5b9992e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Aug 2014 18:32:09 -0700 Subject: net: use ktime_get_ns() and ktime_get_real_ns() helpers ktime_get_ns() replaces ktime_to_ns(ktime_get()) ktime_get_real_ns() replaces ktime_to_ns(ktime_get_real()) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/codel.h | 2 +- include/net/pkt_sched.h | 2 +- net/core/secure_seq.c | 6 +++--- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 2 +- net/netfilter/nf_conntrack_standalone.c | 2 +- net/sched/act_police.c | 4 ++-- net/sched/sch_fq.c | 4 ++-- net/sched/sch_htb.c | 6 +++--- net/sched/sch_tbf.c | 6 +++--- 10 files changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/codel.h b/include/net/codel.h index fe0eab32ce76..aeee28081245 100644 --- a/include/net/codel.h +++ b/include/net/codel.h @@ -66,7 +66,7 @@ typedef s32 codel_tdiff_t; static inline codel_time_t codel_get_time(void) { - u64 ns = ktime_to_ns(ktime_get()); + u64 ns = ktime_get_ns(); return ns >> CODEL_SHIFT; } diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index ec030cd76616..8bbe626e9ece 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -50,7 +50,7 @@ typedef long psched_tdiff_t; static inline psched_time_t psched_get_time(void) { - return PSCHED_NS2TICKS(ktime_to_ns(ktime_get())); + return PSCHED_NS2TICKS(ktime_get_ns()); } static inline psched_tdiff_t diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index ba71212f0251..51dd3193a33e 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -35,7 +35,7 @@ static u32 seq_scale(u32 seq) * overlaps less than one time per MSL (2 minutes). * Choosing a clock of 64 ns period is OK. (period of 274 s) */ - return seq + (ktime_to_ns(ktime_get_real()) >> 6); + return seq + (ktime_get_real_ns() >> 6); } #endif @@ -135,7 +135,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, md5_transform(hash, net_secret); seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); + seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; @@ -163,7 +163,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, md5_transform(hash, secret); seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); + seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index de88c4ab5146..0b634e7f1652 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -358,7 +358,7 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) tstamp = nf_conn_tstamp_find(ct); if (tstamp && tstamp->stop == 0) - tstamp->stop = ktime_to_ns(ktime_get_real()); + tstamp->stop = ktime_get_real_ns(); if (nf_ct_is_dying(ct)) goto delete; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 355a5c4ef763..1bd9ed9e62f6 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1737,7 +1737,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, } tstamp = nf_conn_tstamp_find(ct); if (tstamp) - tstamp->start = ktime_to_ns(ktime_get_real()); + tstamp->start = ktime_get_real_ns(); err = nf_conntrack_hash_check_insert(ct); if (err < 0) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index f641751dba9d..cf65a1e040dd 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -101,7 +101,7 @@ static void *ct_seq_start(struct seq_file *seq, loff_t *pos) { struct ct_iter_state *st = seq->private; - st->time_now = ktime_to_ns(ktime_get_real()); + st->time_now = ktime_get_real_ns(); rcu_read_lock(); return ct_get_idx(seq, *pos); } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0566e4606a4a..f32bcb094915 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -231,7 +231,7 @@ override: if (ret != ACT_P_CREATED) return ret; - police->tcfp_t_c = ktime_to_ns(ktime_get()); + police->tcfp_t_c = ktime_get_ns(); police->tcf_index = parm->index ? parm->index : tcf_hash_new_index(hinfo); h = tcf_hash(police->tcf_index, POL_TAB_MASK); @@ -279,7 +279,7 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, return police->tcfp_result; } - now = ktime_to_ns(ktime_get()); + now = ktime_get_ns(); toks = min_t(s64, now - police->tcfp_t_c, police->tcfp_burst); if (police->peak_present) { diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index ba32c2b005d0..e12f997e1b4c 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -416,7 +416,7 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) static struct sk_buff *fq_dequeue(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_to_ns(ktime_get()); + u64 now = ktime_get_ns(); struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; @@ -787,7 +787,7 @@ nla_put_failure: static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_to_ns(ktime_get()); + u64 now = ktime_get_ns(); struct tc_fq_qd_stats st = { .gc_flows = q->stat_gc_flows, .highprio_packets = q->stat_internal_packets, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 9f949abcacef..aea942ce6008 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -895,7 +895,7 @@ ok: if (!sch->q.qlen) goto fin; - q->now = ktime_to_ns(ktime_get()); + q->now = ktime_get_ns(); start_at = jiffies; next_event = q->now + 5LLU * NSEC_PER_SEC; @@ -1225,7 +1225,7 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, parent->un.leaf.q = new_q ? new_q : &noop_qdisc; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; - parent->t_c = ktime_to_ns(ktime_get()); + parent->t_c = ktime_get_ns(); parent->cmode = HTB_CAN_SEND; } @@ -1455,7 +1455,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->tokens = PSCHED_TICKS2NS(hopt->buffer); cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer); cl->mbuffer = 60ULL * NSEC_PER_SEC; /* 1min */ - cl->t_c = ktime_to_ns(ktime_get()); + cl->t_c = ktime_get_ns(); cl->cmode = HTB_CAN_SEND; /* attach to the hash list and parent's family */ diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 18ff63433709..0c39b754083b 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -239,7 +239,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) s64 ptoks = 0; unsigned int len = qdisc_pkt_len(skb); - now = ktime_to_ns(ktime_get()); + now = ktime_get_ns(); toks = min_t(s64, now - q->t_c, q->buffer); if (tbf_peak_present(q)) { @@ -292,7 +292,7 @@ static void tbf_reset(struct Qdisc *sch) qdisc_reset(q->qdisc); sch->q.qlen = 0; - q->t_c = ktime_to_ns(ktime_get()); + q->t_c = ktime_get_ns(); q->tokens = q->buffer; q->ptokens = q->mtu; qdisc_watchdog_cancel(&q->watchdog); @@ -431,7 +431,7 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - q->t_c = ktime_to_ns(ktime_get()); + q->t_c = ktime_get_ns(); qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; -- cgit v1.2.3 From 884cf705c7e60bc6ade7ddafcbe943af4dc84604 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Aug 2014 20:30:12 -0700 Subject: net: remove dead code after sk_data_ready change As a followup to commit 676d23690fb ("net: Fix use after free by removing length arg from sk_data_ready callbacks"), we can remove some useless code in sock_queue_rcv_skb() and rxrpc_queue_rcv_skb() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 8 -------- net/rxrpc/ar-input.c | 9 +-------- 2 files changed, 1 insertion(+), 16 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 2714811afbd8..f7f2352200ad 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -437,7 +437,6 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags) int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; - int skb_len; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; @@ -459,13 +458,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) skb->dev = NULL; skb_set_owner_r(skb, sk); - /* Cache the SKB length before we tack it onto the receive - * queue. Once it is added it no longer belongs to us and - * may be freed by other threads of control pulling packets - * from the queue. - */ - skb_len = skb->len; - /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 63b21e580de9..481f89f93789 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -45,7 +45,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp; struct rxrpc_sock *rx = call->socket; struct sock *sk; - int skb_len, ret; + int ret; _enter(",,%d,%d", force, terminal); @@ -101,13 +101,6 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, rx->interceptor(sk, call->user_call_ID, skb); spin_unlock_bh(&sk->sk_receive_queue.lock); } else { - - /* Cache the SKB length before we tack it onto the - * receive queue. Once it is added it no longer - * belongs to us and may be freed by other threads of - * control pulling packets from the queue */ - skb_len = skb->len; - _net("post skb %p", skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock_bh(&sk->sk_receive_queue.lock); -- cgit v1.2.3 From 989e04c5bc3ff77d65e1f0d87bf7904dfa30d41c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 22 Aug 2014 14:15:22 -0700 Subject: tcp: improve undo on timeout Upon timeout, undo (via both timestamps/Eifel and DSACKs) was disabled if any retransmits were still in flight. The concern was perhaps that spurious retransmission sent in a previous recovery episode may trigger DSACKs to falsely undo the current recovery. However, this inadvertently misses undo opportunities (using either TCP timestamps or DSACKs) when timeout occurs during a loss episode, i.e. recurring timeouts or timeout during fast recovery. In these cases some retransmissions will be in flight but we should allow undo. Furthermore, we should only reset undo_marker and undo_retrans upon timeout if we are starting a new recovery episode. Finally, when we do reset our undo state, we now do so in a manner similar to tcp_enter_recovery(), so that we require a DSACK for each of the outstsanding retransmissions. This will achieve the original goal by requiring that we receive the same number of DSACKs as retransmissions. This patch increases the undo events by 50% on Google servers. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- net/ipv4/tcp_input.c | 26 +++++++++++--------------- 2 files changed, 12 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fa5258f322e7..e567f0dbf282 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -276,7 +276,7 @@ struct tcp_sock { u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ - u32 undo_marker; /* tracking retrans started here. */ + u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ u32 total_retrans; /* Total retransmits for entire connection */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a906e0200ff2..aba4926ca095 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1888,21 +1888,21 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->sacked_out = 0; } -static void tcp_clear_retrans_partial(struct tcp_sock *tp) +void tcp_clear_retrans(struct tcp_sock *tp) { tp->retrans_out = 0; tp->lost_out = 0; - tp->undo_marker = 0; tp->undo_retrans = -1; + tp->fackets_out = 0; + tp->sacked_out = 0; } -void tcp_clear_retrans(struct tcp_sock *tp) +static inline void tcp_init_undo(struct tcp_sock *tp) { - tcp_clear_retrans_partial(tp); - - tp->fackets_out = 0; - tp->sacked_out = 0; + tp->undo_marker = tp->snd_una; + /* Retransmission still in flight may cause DSACKs later. */ + tp->undo_retrans = tp->retrans_out ? : -1; } /* Enter Loss state. If we detect SACK reneging, forget all SACK information @@ -1925,18 +1925,18 @@ void tcp_enter_loss(struct sock *sk) tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); + tcp_init_undo(tp); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tcp_clear_retrans_partial(tp); + tp->retrans_out = 0; + tp->lost_out = 0; if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - tp->undo_marker = tp->snd_una; - skb = tcp_write_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { @@ -1950,9 +1950,6 @@ void tcp_enter_loss(struct sock *sk) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) - tp->undo_marker = 0; - TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; @@ -2671,8 +2668,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->prior_ssthresh = 0; - tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out ? : -1; + tcp_init_undo(tp); if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) -- cgit v1.2.3 From 1dd0bd2b14032037d40a316dd52370f1713fa62b Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:06 -0400 Subject: tipc: introduce new function tipc_msg_create() The function tipc_msg_init() has turned out to be of limited value in many cases. It take too few parameters to be usable for creating a complete message, it makes too many assumptions about what the message should be used for, and it does not allocate any buffer to be returned to the caller. Therefore, we now introduce the new function tipc_msg_create(), which takes all the parameters needed to create a full message, and returns a buffer of the requested size. The new function will be very useful for the changes we will be doing in later commits in this series. Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/msg.c | 31 +++++++++++++++++++++++++++++-- net/tipc/msg.h | 4 ++++ 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 9680be6d388a..fdb92e247050 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -56,8 +56,35 @@ void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type, u32 hsize, msg_set_size(m, hsize); msg_set_prevnode(m, tipc_own_addr); msg_set_type(m, type); - msg_set_orignode(m, tipc_own_addr); - msg_set_destnode(m, destnode); + if (hsize > SHORT_H_SIZE) { + msg_set_orignode(m, tipc_own_addr); + msg_set_destnode(m, destnode); + } +} + +struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, + uint data_sz, u32 dnode, u32 onode, + u32 dport, u32 oport, int errcode) +{ + struct tipc_msg *msg; + struct sk_buff *buf; + + buf = tipc_buf_acquire(hdr_sz + data_sz); + if (unlikely(!buf)) + return NULL; + + msg = buf_msg(buf); + tipc_msg_init(msg, user, type, hdr_sz, dnode); + msg_set_size(msg, hdr_sz + data_sz); + msg_set_prevnode(msg, onode); + msg_set_origport(msg, oport); + msg_set_destport(msg, dport); + msg_set_errcode(msg, errcode); + if (hdr_sz > SHORT_H_SIZE) { + msg_set_orignode(msg, onode); + msg_set_destnode(msg, dnode); + } + return buf; } /* tipc_buf_append(): Append a buffer to the fragment list of another buffer diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 462fa194a6af..3045b2cfbff8 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -732,6 +732,10 @@ int tipc_msg_eval(struct sk_buff *buf, u32 *dnode); void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); +struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, + uint data_sz, u32 dnode, u32 onode, + u32 dport, u32 oport, int errcode); + int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); bool tipc_msg_bundle(struct sk_buff *bbuf, struct sk_buff *buf, u32 mtu); -- cgit v1.2.3 From 50100a5e39461b2a61d6040e73c384766c29975d Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:07 -0400 Subject: tipc: use pseudo message to wake up sockets after link congestion The current link implementation keeps a linked list of blocked ports/ sockets that is populated when there is link congestion. The purpose of this is to let the link know which users to wake up when the congestion abates. This adds unnecessary complexity to the data structure and the code, since it forces us to involve the link each time we want to delete a socket. It also forces us to grab the spinlock port_lock within the scope of node_lock. We want to get rid of this direct dependence, as well as the deadlock hazard resulting from the usage of port_lock. In this commit, we instead let the link keep list of a "wakeup" pseudo messages for use in such situations. Those messages are sent to the pending sockets via the ordinary message reception path, and wake up the socket's owner when they are received. This enables us to get rid of the 'waiting_ports' linked lists in struct tipc_port that manifest this direct reference. As a consequence, we can eliminate another BH entry into the socket, and hence the need to grab port_lock. This is a further step in our effort to remove port_lock altogether. Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bcast.c | 7 ++-- net/tipc/core.h | 5 ++- net/tipc/link.c | 119 +++++++++++++++++++++++++----------------------------- net/tipc/link.h | 7 ++-- net/tipc/msg.c | 7 +++- net/tipc/msg.h | 1 + net/tipc/node.c | 12 ++++++ net/tipc/node.h | 4 +- net/tipc/port.c | 2 - net/tipc/port.h | 4 -- net/tipc/socket.c | 15 +++++-- net/tipc/socket.h | 7 +--- 12 files changed, 99 insertions(+), 91 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index dd13bfa09333..9510fb2df566 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -300,8 +300,8 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) tipc_link_push_queue(bcl); bclink_set_last_sent(); } - if (unlikely(released && !list_empty(&bcl->waiting_ports))) - tipc_link_wakeup_ports(bcl, 0); + if (unlikely(released && !skb_queue_empty(&bcl->waiting_sks))) + bclink->node.action_flags |= TIPC_WAKEUP_USERS; exit: tipc_bclink_unlock(); } @@ -840,9 +840,10 @@ int tipc_bclink_init(void) sprintf(bcbearer->media.name, "tipc-broadcast"); spin_lock_init(&bclink->lock); - INIT_LIST_HEAD(&bcl->waiting_ports); + __skb_queue_head_init(&bcl->waiting_sks); bcl->next_out_no = 1; spin_lock_init(&bclink->node.lock); + __skb_queue_head_init(&bclink->node.waiting_sks); bcl->owner = &bclink->node; bcl->max_pkt = MAX_PKT_DEFAULT_MCAST; tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); diff --git a/net/tipc/core.h b/net/tipc/core.h index bb26ed1ee966..d2607a8e2b80 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -187,8 +187,11 @@ static inline void k_term_timer(struct timer_list *timer) struct tipc_skb_cb { void *handle; - bool deferred; struct sk_buff *tail; + bool deferred; + bool wakeup_pending; + u16 chain_sz; + u16 chain_imp; }; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) diff --git a/net/tipc/link.c b/net/tipc/link.c index fb1485dc6736..6c775a107a02 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -275,7 +275,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, link_init_max_pkt(l_ptr); l_ptr->next_out_no = 1; - INIT_LIST_HEAD(&l_ptr->waiting_ports); + __skb_queue_head_init(&l_ptr->waiting_sks); link_reset_statistics(l_ptr); @@ -322,66 +322,47 @@ void tipc_link_delete_list(unsigned int bearer_id, bool shutting_down) } /** - * link_schedule_port - schedule port for deferred sending - * @l_ptr: pointer to link - * @origport: reference to sending port - * @sz: amount of data to be sent - * - * Schedules port for renewed sending of messages after link congestion - * has abated. + * link_schedule_user - schedule user for wakeup after congestion + * @link: congested link + * @oport: sending port + * @chain_sz: size of buffer chain that was attempted sent + * @imp: importance of message attempted sent + * Create pseudo msg to send back to user when congestion abates */ -static int link_schedule_port(struct tipc_link *l_ptr, u32 origport, u32 sz) +static bool link_schedule_user(struct tipc_link *link, u32 oport, + uint chain_sz, uint imp) { - struct tipc_port *p_ptr; - struct tipc_sock *tsk; + struct sk_buff *buf; - spin_lock_bh(&tipc_port_list_lock); - p_ptr = tipc_port_lock(origport); - if (p_ptr) { - if (!list_empty(&p_ptr->wait_list)) - goto exit; - tsk = tipc_port_to_sock(p_ptr); - tsk->link_cong = 1; - p_ptr->waiting_pkts = 1 + ((sz - 1) / l_ptr->max_pkt); - list_add_tail(&p_ptr->wait_list, &l_ptr->waiting_ports); - l_ptr->stats.link_congs++; -exit: - tipc_port_unlock(p_ptr); - } - spin_unlock_bh(&tipc_port_list_lock); - return -ELINKCONG; + buf = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, tipc_own_addr, + tipc_own_addr, oport, 0, 0); + if (!buf) + return false; + TIPC_SKB_CB(buf)->chain_sz = chain_sz; + TIPC_SKB_CB(buf)->chain_imp = imp; + __skb_queue_tail(&link->waiting_sks, buf); + link->stats.link_congs++; + return true; } -void tipc_link_wakeup_ports(struct tipc_link *l_ptr, int all) +/** + * link_prepare_wakeup - prepare users for wakeup after congestion + * @link: congested link + * Move a number of waiting users, as permitted by available space in + * the send queue, from link wait queue to node wait queue for wakeup + */ +static void link_prepare_wakeup(struct tipc_link *link) { - struct tipc_port *p_ptr; - struct tipc_sock *tsk; - struct tipc_port *temp_p_ptr; - int win = l_ptr->queue_limit[0] - l_ptr->out_queue_size; - - if (all) - win = 100000; - if (win <= 0) - return; - if (!spin_trylock_bh(&tipc_port_list_lock)) - return; - if (link_congested(l_ptr)) - goto exit; - list_for_each_entry_safe(p_ptr, temp_p_ptr, &l_ptr->waiting_ports, - wait_list) { - if (win <= 0) + struct sk_buff_head *wq = &link->waiting_sks; + struct sk_buff *buf; + uint pend_qsz = link->out_queue_size; + + for (buf = skb_peek(wq); buf; buf = skb_peek(wq)) { + if (pend_qsz >= link->queue_limit[TIPC_SKB_CB(buf)->chain_imp]) break; - tsk = tipc_port_to_sock(p_ptr); - list_del_init(&p_ptr->wait_list); - spin_lock_bh(p_ptr->lock); - tsk->link_cong = 0; - tipc_sock_wakeup(tsk); - win -= p_ptr->waiting_pkts; - spin_unlock_bh(p_ptr->lock); + pend_qsz += TIPC_SKB_CB(buf)->chain_sz; + __skb_queue_tail(&link->owner->waiting_sks, __skb_dequeue(wq)); } - -exit: - spin_unlock_bh(&tipc_port_list_lock); } /** @@ -423,6 +404,7 @@ void tipc_link_reset(struct tipc_link *l_ptr) u32 prev_state = l_ptr->state; u32 checkpoint = l_ptr->next_in_no; int was_active_link = tipc_link_is_active(l_ptr); + struct tipc_node *owner = l_ptr->owner; msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff)); @@ -450,9 +432,10 @@ void tipc_link_reset(struct tipc_link *l_ptr) kfree_skb(l_ptr->proto_msg_queue); l_ptr->proto_msg_queue = NULL; kfree_skb_list(l_ptr->oldest_deferred_in); - if (!list_empty(&l_ptr->waiting_ports)) - tipc_link_wakeup_ports(l_ptr, 1); - + if (!skb_queue_empty(&l_ptr->waiting_sks)) { + skb_queue_splice_init(&l_ptr->waiting_sks, &owner->waiting_sks); + owner->action_flags |= TIPC_WAKEUP_USERS; + } l_ptr->retransm_queue_head = 0; l_ptr->retransm_queue_size = 0; l_ptr->last_out = NULL; @@ -688,19 +671,23 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) static int tipc_link_cong(struct tipc_link *link, struct sk_buff *buf) { struct tipc_msg *msg = buf_msg(buf); - uint psz = msg_size(msg); uint imp = tipc_msg_tot_importance(msg); u32 oport = msg_tot_origport(msg); - if (likely(imp <= TIPC_CRITICAL_IMPORTANCE)) { - if (!msg_errcode(msg) && !msg_reroute_cnt(msg)) { - link_schedule_port(link, oport, psz); - return -ELINKCONG; - } - } else { + if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); tipc_link_reset(link); + goto drop; } + if (unlikely(msg_errcode(msg))) + goto drop; + if (unlikely(msg_reroute_cnt(msg))) + goto drop; + if (TIPC_SKB_CB(buf)->wakeup_pending) + return -ELINKCONG; + if (link_schedule_user(link, oport, TIPC_SKB_CB(buf)->chain_sz, imp)) + return -ELINKCONG; +drop: kfree_skb_list(buf); return -EHOSTUNREACH; } @@ -1202,8 +1189,10 @@ void tipc_rcv(struct sk_buff *head, struct tipc_bearer *b_ptr) if (unlikely(l_ptr->next_out)) tipc_link_push_queue(l_ptr); - if (unlikely(!list_empty(&l_ptr->waiting_ports))) - tipc_link_wakeup_ports(l_ptr, 0); + if (released && !skb_queue_empty(&l_ptr->waiting_sks)) { + link_prepare_wakeup(l_ptr); + l_ptr->owner->action_flags |= TIPC_WAKEUP_USERS; + } /* Process the incoming packet */ if (unlikely(!link_working_working(l_ptr))) { diff --git a/net/tipc/link.h b/net/tipc/link.h index 782983ccd323..b567a3427fda 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -1,7 +1,7 @@ /* * net/tipc/link.h: Include file for TIPC link code * - * Copyright (c) 1995-2006, 2013, Ericsson AB + * Copyright (c) 1995-2006, 2013-2014, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -133,7 +133,7 @@ struct tipc_stats { * @retransm_queue_size: number of messages to retransmit * @retransm_queue_head: sequence number of first message to retransmit * @next_out: ptr to first unsent outbound message in queue - * @waiting_ports: linked list of ports waiting for link congestion to abate + * @waiting_sks: linked list of sockets waiting for link congestion to abate * @long_msg_seq_no: next identifier to use for outbound fragmented messages * @reasm_buf: head of partially reassembled inbound message fragments * @stats: collects statistics regarding link activity @@ -194,7 +194,7 @@ struct tipc_link { u32 retransm_queue_size; u32 retransm_queue_head; struct sk_buff *next_out; - struct list_head waiting_ports; + struct sk_buff_head waiting_sks; /* Fragmentation/reassembly */ u32 long_msg_seq_no; @@ -235,7 +235,6 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob, void tipc_link_push_queue(struct tipc_link *l_ptr); u32 tipc_link_defer_pkt(struct sk_buff **head, struct sk_buff **tail, struct sk_buff *buf); -void tipc_link_wakeup_ports(struct tipc_link *l_ptr, int all); void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window); void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *start, u32 retransmits); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index fdb92e247050..74745a47d72a 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -182,7 +182,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct iovec const *iov, struct sk_buff *buf, *prev; char *pktpos; int rc; - + uint chain_sz = 0; msg_set_size(mhdr, msz); /* No fragmentation needed? */ @@ -193,6 +193,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct iovec const *iov, return -ENOMEM; skb_copy_to_linear_data(buf, mhdr, mhsz); pktpos = buf->data + mhsz; + TIPC_SKB_CB(buf)->chain_sz = 1; if (!dsz || !memcpy_fromiovecend(pktpos, iov, offset, dsz)) return dsz; rc = -EFAULT; @@ -209,6 +210,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct iovec const *iov, *chain = buf = tipc_buf_acquire(pktmax); if (!buf) return -ENOMEM; + chain_sz = 1; pktpos = buf->data; skb_copy_to_linear_data(buf, &pkthdr, INT_H_SIZE); pktpos += INT_H_SIZE; @@ -242,6 +244,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct iovec const *iov, rc = -ENOMEM; goto error; } + chain_sz++; prev->next = buf; msg_set_type(&pkthdr, FRAGMENT); msg_set_size(&pkthdr, pktsz); @@ -251,7 +254,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct iovec const *iov, pktrem = pktsz - INT_H_SIZE; } while (1); - + TIPC_SKB_CB(*chain)->chain_sz = chain_sz; msg_set_type(buf_msg(buf), LAST_FRAGMENT); return dsz; error: diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 3045b2cfbff8..0ea7b695ac4d 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -442,6 +442,7 @@ static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 +#define SOCK_WAKEUP 14 /* pseudo user */ /* * Connection management protocol message types diff --git a/net/tipc/node.c b/net/tipc/node.c index f7069299943f..6ea2c15cfc88 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -38,6 +38,7 @@ #include "config.h" #include "node.h" #include "name_distr.h" +#include "socket.h" #define NODE_HTABLE_SIZE 512 @@ -100,6 +101,7 @@ struct tipc_node *tipc_node_create(u32 addr) INIT_HLIST_NODE(&n_ptr->hash); INIT_LIST_HEAD(&n_ptr->list); INIT_LIST_HEAD(&n_ptr->nsub); + __skb_queue_head_init(&n_ptr->waiting_sks); hlist_add_head_rcu(&n_ptr->hash, &node_htable[tipc_hashfn(addr)]); @@ -474,6 +476,7 @@ int tipc_node_get_linkname(u32 bearer_id, u32 addr, char *linkname, size_t len) void tipc_node_unlock(struct tipc_node *node) { LIST_HEAD(nsub_list); + struct sk_buff_head waiting_sks; u32 addr = 0; if (likely(!node->action_flags)) { @@ -481,6 +484,11 @@ void tipc_node_unlock(struct tipc_node *node) return; } + __skb_queue_head_init(&waiting_sks); + if (node->action_flags & TIPC_WAKEUP_USERS) { + skb_queue_splice_init(&node->waiting_sks, &waiting_sks); + node->action_flags &= ~TIPC_WAKEUP_USERS; + } if (node->action_flags & TIPC_NOTIFY_NODE_DOWN) { list_replace_init(&node->nsub, &nsub_list); node->action_flags &= ~TIPC_NOTIFY_NODE_DOWN; @@ -491,8 +499,12 @@ void tipc_node_unlock(struct tipc_node *node) } spin_unlock_bh(&node->lock); + while (!skb_queue_empty(&waiting_sks)) + tipc_sk_rcv(__skb_dequeue(&waiting_sks)); + if (!list_empty(&nsub_list)) tipc_nodesub_notify(&nsub_list); + if (addr) tipc_named_node_up(addr); } diff --git a/net/tipc/node.h b/net/tipc/node.h index b61716a8218e..2ebf9e8b50fd 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -58,7 +58,8 @@ enum { TIPC_WAIT_PEER_LINKS_DOWN = (1 << 1), TIPC_WAIT_OWN_LINKS_DOWN = (1 << 2), TIPC_NOTIFY_NODE_DOWN = (1 << 3), - TIPC_NOTIFY_NODE_UP = (1 << 4) + TIPC_NOTIFY_NODE_UP = (1 << 4), + TIPC_WAKEUP_USERS = (1 << 5) }; /** @@ -115,6 +116,7 @@ struct tipc_node { int working_links; u32 signature; struct list_head nsub; + struct sk_buff_head waiting_sks; struct rcu_head rcu; }; diff --git a/net/tipc/port.c b/net/tipc/port.c index 7e096a5e7701..b58a777a4399 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -92,7 +92,6 @@ u32 tipc_port_init(struct tipc_port *p_ptr, p_ptr->max_pkt = MAX_PKT_DEFAULT; p_ptr->ref = ref; - INIT_LIST_HEAD(&p_ptr->wait_list); INIT_LIST_HEAD(&p_ptr->subscription.nodesub_list); k_init_timer(&p_ptr->timer, (Handler)port_timeout, ref); INIT_LIST_HEAD(&p_ptr->publications); @@ -134,7 +133,6 @@ void tipc_port_destroy(struct tipc_port *p_ptr) } spin_lock_bh(&tipc_port_list_lock); list_del(&p_ptr->port_list); - list_del(&p_ptr->wait_list); spin_unlock_bh(&tipc_port_list_lock); k_term_timer(&p_ptr->timer); } diff --git a/net/tipc/port.h b/net/tipc/port.h index 3087da39ee47..6cdc7de8c9b8 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -58,8 +58,6 @@ * @ref: unique reference to port in TIPC object registry * @phdr: preformatted message header used when sending messages * @port_list: adjacent ports in TIPC's global list of ports - * @wait_list: adjacent ports in list of ports waiting on link congestion - * @waiting_pkts: * @publications: list of publications for port * @pub_count: total # of publications port has made during its lifetime * @probing_state: @@ -77,8 +75,6 @@ struct tipc_port { u32 ref; struct tipc_msg phdr; struct list_head port_list; - struct list_head wait_list; - u32 waiting_pkts; struct list_head publications; u32 pub_count; u32 probing_state; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ff8c8118d56e..a8be4d2001f7 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -579,6 +579,7 @@ new_mtu: goto new_mtu; if (rc != -ELINKCONG) break; + tipc_sk(sk)->link_cong = 1; rc = tipc_wait_for_sndmsg(sock, &timeo); if (rc) kfree_skb_list(buf); @@ -651,7 +652,7 @@ static int tipc_sk_proto_rcv(struct tipc_sock *tsk, u32 *dnode, conn_cong = tipc_sk_conn_cong(tsk); tsk->sent_unacked -= msg_msgcnt(msg); if (conn_cong) - tipc_sock_wakeup(tsk); + tsk->sk.sk_write_space(&tsk->sk); } else if (msg_type(msg) == CONN_PROBE) { if (!tipc_msg_reverse(buf, dnode, TIPC_OK)) return TIPC_OK; @@ -826,6 +827,7 @@ new_mtu: goto exit; do { + TIPC_SKB_CB(buf)->wakeup_pending = tsk->link_cong; rc = tipc_link_xmit(buf, dnode, tsk->port.ref); if (likely(rc >= 0)) { if (sock->state != SS_READY) @@ -835,10 +837,9 @@ new_mtu: } if (rc == -EMSGSIZE) goto new_mtu; - if (rc != -ELINKCONG) break; - + tsk->link_cong = 1; rc = tipc_wait_for_sndmsg(sock, &timeo); if (rc) kfree_skb_list(buf); @@ -953,6 +954,7 @@ next: } if (rc != -ELINKCONG) break; + tsk->link_cong = 1; } rc = tipc_wait_for_sndpkt(sock, &timeo); if (rc) @@ -1518,6 +1520,13 @@ static int filter_rcv(struct sock *sk, struct sk_buff *buf) if (unlikely(msg_user(msg) == CONN_MANAGER)) return tipc_sk_proto_rcv(tsk, &onode, buf); + if (unlikely(msg_user(msg) == SOCK_WAKEUP)) { + kfree_skb(buf); + tsk->link_cong = 0; + sk->sk_write_space(sk); + return TIPC_OK; + } + /* Reject message if it is wrong sort of message for socket */ if (msg_type(msg) > TIPC_DIRECT_MSG) return -TIPC_ERR_NO_PORT; diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 43b75b3ceced..1405633362f5 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -58,7 +58,7 @@ struct tipc_sock { struct tipc_port port; unsigned int conn_timeout; atomic_t dupl_rcvcnt; - int link_cong; + bool link_cong; uint sent_unacked; uint rcv_unacked; }; @@ -73,11 +73,6 @@ static inline struct tipc_sock *tipc_port_to_sock(const struct tipc_port *port) return container_of(port, struct tipc_sock, port); } -static inline void tipc_sock_wakeup(struct tipc_sock *tsk) -{ - tsk->sk.sk_write_space(&tsk->sk); -} - static inline int tipc_sk_conn_cong(struct tipc_sock *tsk) { return tsk->sent_unacked >= TIPC_FLOWCTRL_WIN; -- cgit v1.2.3 From 02be61a981fb5ca5f1526323336198ee92cadf95 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:08 -0400 Subject: tipc: use message to abort connections when losing contact to node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the current implementation, each 'struct tipc_node' instance keeps a linked list of those ports/sockets that are connected to the node represented by that struct. The purpose of this is to let the node object know which sockets to alert when it loses contact with its peer node, i.e., which sockets need to have their connections aborted. This entails an unwanted direct reference from the node structure back to the port/socket structure, and a need to grab port_lock when we have to make an upcall to the port. We want to get rid of this unecessary BH entry point into the socket, and also eliminate its use of port_lock. In this commit, we instead let the node struct keep list of "connected socket" structs, which each represents a connected socket, but is allocated independently by the node at the moment of connection. If the node loses contact with its peer node, the list is traversed, and a "connection abort" message is created for each entry in the list. The message is sent to it respective connected socket using the ordinary data path, and the receiving socket aborts its connections upon reception of the message. This enables us to get rid of the direct reference from 'struct node' to ´struct port', and another unwanted BH access point to the latter. Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/node.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/tipc/node.h | 3 +++ net/tipc/port.c | 29 +++------------------ 3 files changed, 85 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 6ea2c15cfc88..17e6378c4dfe 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -51,6 +51,13 @@ static u32 tipc_num_nodes; static u32 tipc_num_links; static DEFINE_SPINLOCK(node_list_lock); +struct tipc_sock_conn { + u32 port; + u32 peer_port; + u32 peer_node; + struct list_head list; +}; + /* * A trivial power-of-two bitmask technique is used for speed, since this * operation is done for every incoming TIPC packet. The number of hash table @@ -101,6 +108,7 @@ struct tipc_node *tipc_node_create(u32 addr) INIT_HLIST_NODE(&n_ptr->hash); INIT_LIST_HEAD(&n_ptr->list); INIT_LIST_HEAD(&n_ptr->nsub); + INIT_LIST_HEAD(&n_ptr->conn_sks); __skb_queue_head_init(&n_ptr->waiting_sks); hlist_add_head_rcu(&n_ptr->hash, &node_htable[tipc_hashfn(addr)]); @@ -138,6 +146,71 @@ void tipc_node_stop(void) spin_unlock_bh(&node_list_lock); } +int tipc_node_add_conn(u32 dnode, u32 port, u32 peer_port) +{ + struct tipc_node *node; + struct tipc_sock_conn *conn; + + if (in_own_node(dnode)) + return 0; + + node = tipc_node_find(dnode); + if (!node) { + pr_warn("Connecting sock to node 0x%x failed\n", dnode); + return -EHOSTUNREACH; + } + conn = kmalloc(sizeof(*conn), GFP_ATOMIC); + if (!conn) + return -EHOSTUNREACH; + conn->peer_node = dnode; + conn->port = port; + conn->peer_port = peer_port; + + tipc_node_lock(node); + list_add_tail(&conn->list, &node->conn_sks); + tipc_node_unlock(node); + return 0; +} + +void tipc_node_remove_conn(u32 dnode, u32 port) +{ + struct tipc_node *node; + struct tipc_sock_conn *conn, *safe; + + if (in_own_node(dnode)) + return; + + node = tipc_node_find(dnode); + if (!node) + return; + + tipc_node_lock(node); + list_for_each_entry_safe(conn, safe, &node->conn_sks, list) { + if (port != conn->port) + continue; + list_del(&conn->list); + kfree(conn); + } + tipc_node_unlock(node); +} + +void tipc_node_abort_sock_conns(struct list_head *conns) +{ + struct tipc_sock_conn *conn, *safe; + struct sk_buff *buf; + + list_for_each_entry_safe(conn, safe, conns, list) { + buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, + SHORT_H_SIZE, 0, tipc_own_addr, + conn->peer_node, conn->port, + conn->peer_port, TIPC_ERR_NO_NODE); + if (likely(buf)) + tipc_sk_rcv(buf); + list_del(&conn->list); + kfree(conn); + } +} + /** * tipc_node_link_up - handle addition of link * @@ -476,6 +549,7 @@ int tipc_node_get_linkname(u32 bearer_id, u32 addr, char *linkname, size_t len) void tipc_node_unlock(struct tipc_node *node) { LIST_HEAD(nsub_list); + LIST_HEAD(conn_sks); struct sk_buff_head waiting_sks; u32 addr = 0; @@ -491,6 +565,7 @@ void tipc_node_unlock(struct tipc_node *node) } if (node->action_flags & TIPC_NOTIFY_NODE_DOWN) { list_replace_init(&node->nsub, &nsub_list); + list_replace_init(&node->conn_sks, &conn_sks); node->action_flags &= ~TIPC_NOTIFY_NODE_DOWN; } if (node->action_flags & TIPC_NOTIFY_NODE_UP) { @@ -502,6 +577,9 @@ void tipc_node_unlock(struct tipc_node *node) while (!skb_queue_empty(&waiting_sks)) tipc_sk_rcv(__skb_dequeue(&waiting_sks)); + if (!list_empty(&conn_sks)) + tipc_node_abort_sock_conns(&conn_sks); + if (!list_empty(&nsub_list)) tipc_nodesub_notify(&nsub_list); diff --git a/net/tipc/node.h b/net/tipc/node.h index 2ebf9e8b50fd..522d6f3157b3 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -117,6 +117,7 @@ struct tipc_node { u32 signature; struct list_head nsub; struct sk_buff_head waiting_sks; + struct list_head conn_sks; struct rcu_head rcu; }; @@ -135,6 +136,8 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space) struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space); int tipc_node_get_linkname(u32 bearer_id, u32 node, char *linkname, size_t len); void tipc_node_unlock(struct tipc_node *node); +int tipc_node_add_conn(u32 dnode, u32 port, u32 peer_port); +void tipc_node_remove_conn(u32 dnode, u32 port); static inline void tipc_node_lock(struct tipc_node *node) { diff --git a/net/tipc/port.c b/net/tipc/port.c index b58a777a4399..edbd83d223c5 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -48,7 +48,6 @@ DEFINE_SPINLOCK(tipc_port_list_lock); static LIST_HEAD(ports); -static void port_handle_node_down(unsigned long ref); static struct sk_buff *port_build_self_abort_msg(struct tipc_port *, u32 err); static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *, u32 err); static void port_timeout(unsigned long ref); @@ -126,10 +125,10 @@ void tipc_port_destroy(struct tipc_port *p_ptr) k_cancel_timer(&p_ptr->timer); if (p_ptr->connected) { buf = port_build_peer_abort_msg(p_ptr, TIPC_ERR_NO_PORT); - tipc_nodesub_unsubscribe(&p_ptr->subscription); msg = buf_msg(buf); peer = msg_destnode(msg); tipc_link_xmit(buf, peer, msg_link_selector(msg)); + tipc_node_remove_conn(peer, p_ptr->ref); } spin_lock_bh(&tipc_port_list_lock); list_del(&p_ptr->port_list); @@ -188,22 +187,6 @@ static void port_timeout(unsigned long ref) tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); } - -static void port_handle_node_down(unsigned long ref) -{ - struct tipc_port *p_ptr = tipc_port_lock(ref); - struct sk_buff *buf = NULL; - struct tipc_msg *msg = NULL; - - if (!p_ptr) - return; - buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_NODE); - tipc_port_unlock(p_ptr); - msg = buf_msg(buf); - tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); -} - - static struct sk_buff *port_build_self_abort_msg(struct tipc_port *p_ptr, u32 err) { struct sk_buff *buf = port_build_peer_abort_msg(p_ptr, err); @@ -217,7 +200,6 @@ static struct sk_buff *port_build_self_abort_msg(struct tipc_port *p_ptr, u32 er return buf; } - static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *p_ptr, u32 err) { struct sk_buff *buf; @@ -447,11 +429,8 @@ int __tipc_port_connect(u32 ref, struct tipc_port *p_ptr, p_ptr->probing_state = TIPC_CONN_OK; p_ptr->connected = 1; k_start_timer(&p_ptr->timer, p_ptr->probing_interval); - - tipc_nodesub_subscribe(&p_ptr->subscription, peer->node, - (void *)(unsigned long)ref, - (net_ev_handler)port_handle_node_down); - res = 0; + res = tipc_node_add_conn(tipc_port_peernode(p_ptr), p_ptr->ref, + tipc_port_peerport(p_ptr)); exit: p_ptr->max_pkt = tipc_node_get_mtu(peer->node, ref); return res; @@ -467,7 +446,7 @@ int __tipc_port_disconnect(struct tipc_port *tp_ptr) if (tp_ptr->connected) { tp_ptr->connected = 0; /* let timer expire on it's own to avoid deadlock! */ - tipc_nodesub_unsubscribe(&tp_ptr->subscription); + tipc_node_remove_conn(tipc_port_peernode(tp_ptr), tp_ptr->ref); return 0; } -- cgit v1.2.3 From 5728901581139e68e6cf53b36590f64829c37453 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:09 -0400 Subject: tipc: clean up socket timer function The last remaining BH upcall to the socket, apart for the message reception function tipc_sk_rcv(), is the timer function. We prefer to let this function continue executing in BH, since it only does read-acces to semi-permanent data, but we make three changes to it: 1) We introduce a bh_lock_sock()/bh_unlock_sock() inside the scope of port_lock. This is a preparation for replacing port_lock with bh_lock_sock() at the locations where it is still used. 2) We move the function from port.c to socket.c, as a further step of eliminating the port code level altogether. 3) We let it make use of the newly introduced tipc_msg_create() function. This enables us to get rid of three context specific functions (port_create_self_abort_msg() etc.) in port.c Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/port.c | 126 ++++++++++-------------------------------------------- net/tipc/socket.c | 46 ++++++++++++++++++++ 2 files changed, 69 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/net/tipc/port.c b/net/tipc/port.c index edbd83d223c5..6de79f26981e 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -48,9 +48,6 @@ DEFINE_SPINLOCK(tipc_port_list_lock); static LIST_HEAD(ports); -static struct sk_buff *port_build_self_abort_msg(struct tipc_port *, u32 err); -static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *, u32 err); -static void port_timeout(unsigned long ref); /** * tipc_port_peer_msg - verify message was sent by connected port's peer @@ -92,7 +89,6 @@ u32 tipc_port_init(struct tipc_port *p_ptr, p_ptr->max_pkt = MAX_PKT_DEFAULT; p_ptr->ref = ref; INIT_LIST_HEAD(&p_ptr->subscription.nodesub_list); - k_init_timer(&p_ptr->timer, (Handler)port_timeout, ref); INIT_LIST_HEAD(&p_ptr->publications); INIT_LIST_HEAD(&p_ptr->port_list); @@ -114,7 +110,7 @@ void tipc_port_destroy(struct tipc_port *p_ptr) { struct sk_buff *buf = NULL; struct tipc_msg *msg = NULL; - u32 peer; + u32 peer_node; tipc_withdraw(p_ptr, 0, NULL); @@ -124,11 +120,16 @@ void tipc_port_destroy(struct tipc_port *p_ptr) k_cancel_timer(&p_ptr->timer); if (p_ptr->connected) { - buf = port_build_peer_abort_msg(p_ptr, TIPC_ERR_NO_PORT); - msg = buf_msg(buf); - peer = msg_destnode(msg); - tipc_link_xmit(buf, peer, msg_link_selector(msg)); - tipc_node_remove_conn(peer, p_ptr->ref); + peer_node = tipc_port_peernode(p_ptr); + buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, + SHORT_H_SIZE, 0, peer_node, + tipc_own_addr, tipc_port_peerport(p_ptr), + p_ptr->ref, TIPC_ERR_NO_PORT); + if (buf) { + msg = buf_msg(buf); + tipc_link_xmit(buf, peer_node, msg_link_selector(msg)); + } + tipc_node_remove_conn(peer_node, p_ptr->ref); } spin_lock_bh(&tipc_port_list_lock); list_del(&p_ptr->port_list); @@ -136,94 +137,6 @@ void tipc_port_destroy(struct tipc_port *p_ptr) k_term_timer(&p_ptr->timer); } -/* - * port_build_proto_msg(): create connection protocol message for port - * - * On entry the port must be locked and connected. - */ -static struct sk_buff *port_build_proto_msg(struct tipc_port *p_ptr, - u32 type, u32 ack) -{ - struct sk_buff *buf; - struct tipc_msg *msg; - - buf = tipc_buf_acquire(INT_H_SIZE); - if (buf) { - msg = buf_msg(buf); - tipc_msg_init(msg, CONN_MANAGER, type, INT_H_SIZE, - tipc_port_peernode(p_ptr)); - msg_set_destport(msg, tipc_port_peerport(p_ptr)); - msg_set_origport(msg, p_ptr->ref); - msg_set_msgcnt(msg, ack); - buf->next = NULL; - } - return buf; -} - -static void port_timeout(unsigned long ref) -{ - struct tipc_port *p_ptr = tipc_port_lock(ref); - struct sk_buff *buf = NULL; - struct tipc_msg *msg = NULL; - - if (!p_ptr) - return; - - if (!p_ptr->connected) { - tipc_port_unlock(p_ptr); - return; - } - - /* Last probe answered ? */ - if (p_ptr->probing_state == TIPC_CONN_PROBING) { - buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_PORT); - } else { - buf = port_build_proto_msg(p_ptr, CONN_PROBE, 0); - p_ptr->probing_state = TIPC_CONN_PROBING; - k_start_timer(&p_ptr->timer, p_ptr->probing_interval); - } - tipc_port_unlock(p_ptr); - msg = buf_msg(buf); - tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); -} - -static struct sk_buff *port_build_self_abort_msg(struct tipc_port *p_ptr, u32 err) -{ - struct sk_buff *buf = port_build_peer_abort_msg(p_ptr, err); - - if (buf) { - struct tipc_msg *msg = buf_msg(buf); - msg_swap_words(msg, 4, 5); - msg_swap_words(msg, 6, 7); - buf->next = NULL; - } - return buf; -} - -static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *p_ptr, u32 err) -{ - struct sk_buff *buf; - struct tipc_msg *msg; - u32 imp; - - if (!p_ptr->connected) - return NULL; - - buf = tipc_buf_acquire(BASIC_H_SIZE); - if (buf) { - msg = buf_msg(buf); - memcpy(msg, &p_ptr->phdr, BASIC_H_SIZE); - msg_set_hdr_sz(msg, BASIC_H_SIZE); - msg_set_size(msg, BASIC_H_SIZE); - imp = msg_importance(msg); - if (imp < TIPC_CRITICAL_IMPORTANCE) - msg_set_importance(msg, ++imp); - msg_set_errcode(msg, err); - buf->next = NULL; - } - return buf; -} - static int port_print(struct tipc_port *p_ptr, char *buf, int len, int full_id) { struct publication *publ; @@ -321,12 +234,15 @@ void tipc_acknowledge(u32 ref, u32 ack) if (!p_ptr) return; if (p_ptr->connected) - buf = port_build_proto_msg(p_ptr, CONN_ACK, ack); - + buf = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, + 0, tipc_port_peernode(p_ptr), + tipc_own_addr, tipc_port_peerport(p_ptr), + p_ptr->ref, TIPC_OK); tipc_port_unlock(p_ptr); if (!buf) return; msg = buf_msg(buf); + msg_set_msgcnt(msg, ack); tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); } @@ -478,14 +394,18 @@ int tipc_port_shutdown(u32 ref) struct tipc_msg *msg; struct tipc_port *p_ptr; struct sk_buff *buf = NULL; + u32 peer_node; p_ptr = tipc_port_lock(ref); if (!p_ptr) return -EINVAL; - - buf = port_build_peer_abort_msg(p_ptr, TIPC_CONN_SHUTDOWN); + peer_node = tipc_port_peernode(p_ptr); + buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, + SHORT_H_SIZE, 0, peer_node, + tipc_own_addr, tipc_port_peerport(p_ptr), + p_ptr->ref, TIPC_CONN_SHUTDOWN); tipc_port_unlock(p_ptr); msg = buf_msg(buf); - tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); + tipc_link_xmit(buf, peer_node, msg_link_selector(msg)); return tipc_port_disconnect(ref); } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index a8be4d2001f7..5f8376e8da2a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -53,6 +53,7 @@ static void tipc_write_space(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); +static void tipc_sk_timeout(unsigned long ref); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -202,6 +203,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sock->state = state; sock_init_data(sock, sk); + k_init_timer(&port->timer, (Handler)tipc_sk_timeout, ref); sk->sk_backlog_rcv = tipc_backlog_rcv; sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; @@ -1946,6 +1948,50 @@ restart: return res; } +static void tipc_sk_timeout(unsigned long ref) +{ + struct tipc_port *port = tipc_port_lock(ref); + struct tipc_sock *tsk; + struct sock *sk; + struct sk_buff *buf = NULL; + struct tipc_msg *msg = NULL; + u32 peer_port, peer_node; + + if (!port) + return; + + if (!port->connected) { + tipc_port_unlock(port); + return; + } + tsk = tipc_port_to_sock(port); + sk = &tsk->sk; + bh_lock_sock(sk); + peer_port = tipc_port_peerport(port); + peer_node = tipc_port_peernode(port); + + if (port->probing_state == TIPC_CONN_PROBING) { + /* Previous probe not answered -> self abort */ + buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, + SHORT_H_SIZE, 0, tipc_own_addr, + peer_node, ref, peer_port, + TIPC_ERR_NO_PORT); + } else { + buf = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, + 0, peer_node, tipc_own_addr, + peer_port, ref, TIPC_OK); + port->probing_state = TIPC_CONN_PROBING; + k_start_timer(&port->timer, port->probing_interval); + } + bh_unlock_sock(sk); + tipc_port_unlock(port); + if (!buf) + return; + + msg = buf_msg(buf); + tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); +} + /** * tipc_setsockopt - set socket option * @sock: socket structure -- cgit v1.2.3 From 80e44c22255468337b891da2348cab68cb62766f Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:10 -0400 Subject: tipc: eliminate function tipc_port_shutdown() tipc_port_shutdown() is a remnant from the now obsolete native interface. As such it grabs port_lock in order to protect itself from concurrent BH processing. However, after the recent changes to the port/socket upcalls, sockets are now basically single-threaded, and all execution, except the read-only tipc_sk_timer(), is executing within the protection of lock_sock(). So the use of port_lock is not needed here. In this commit we eliminate the whole function, and merge it into its only caller, tipc_shutdown(). Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/port.c | 24 ------------------------ net/tipc/port.h | 2 -- net/tipc/socket.c | 15 +++++++++++---- 3 files changed, 11 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/tipc/port.c b/net/tipc/port.c index 6de79f26981e..3ad092b7fb7d 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -385,27 +385,3 @@ int tipc_port_disconnect(u32 ref) tipc_port_unlock(p_ptr); return res; } - -/* - * tipc_port_shutdown(): Send a SHUTDOWN msg to peer and disconnect - */ -int tipc_port_shutdown(u32 ref) -{ - struct tipc_msg *msg; - struct tipc_port *p_ptr; - struct sk_buff *buf = NULL; - u32 peer_node; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - peer_node = tipc_port_peernode(p_ptr); - buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, - SHORT_H_SIZE, 0, peer_node, - tipc_own_addr, tipc_port_peerport(p_ptr), - p_ptr->ref, TIPC_CONN_SHUTDOWN); - tipc_port_unlock(p_ptr); - msg = buf_msg(buf); - tipc_link_xmit(buf, peer_node, msg_link_selector(msg)); - return tipc_port_disconnect(ref); -} diff --git a/net/tipc/port.h b/net/tipc/port.h index 6cdc7de8c9b8..f5762f940e33 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -106,8 +106,6 @@ int tipc_port_connect(u32 portref, struct tipc_portid const *port); int tipc_port_disconnect(u32 portref); -int tipc_port_shutdown(u32 ref); - /* * The following routines require that the port be locked on entry */ diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 5f8376e8da2a..f202d4790ce3 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1899,7 +1899,7 @@ static int tipc_shutdown(struct socket *sock, int how) struct tipc_sock *tsk = tipc_sk(sk); struct tipc_port *port = &tsk->port; struct sk_buff *buf; - u32 peer; + u32 dnode; int res; if (how != SHUT_RDWR) @@ -1920,10 +1920,17 @@ restart: goto restart; } tipc_port_disconnect(port->ref); - if (tipc_msg_reverse(buf, &peer, TIPC_CONN_SHUTDOWN)) - tipc_link_xmit(buf, peer, 0); + if (tipc_msg_reverse(buf, &dnode, TIPC_CONN_SHUTDOWN)) + tipc_link_xmit(buf, dnode, port->ref); } else { - tipc_port_shutdown(port->ref); + dnode = tipc_port_peernode(port); + buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, + TIPC_CONN_MSG, SHORT_H_SIZE, + 0, dnode, tipc_own_addr, + tipc_port_peerport(port), + port->ref, TIPC_CONN_SHUTDOWN); + tipc_link_xmit(buf, dnode, port->ref); + __tipc_port_disconnect(port); } sock->state = SS_DISCONNECTING; -- cgit v1.2.3 From dadebc00299a19dc4639ba7192db937e31b81eb2 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:11 -0400 Subject: tipc: eliminate port_connect()/port_disconnect() functions tipc_port_connect()/tipc_port_disconnect() are remnants of the obsolete native API. Their only task is to grab port_lock and call the functions __tipc_port_connect()/__tipc_port_disconnect() respectively, which will perform the actual state change. Since socket/port exection now is single-threaded the use of port_lock is not needed any more, so we can safely replace the two functions with their lock-free counterparts. In this commit, we remove the two functions. Furthermore, the contents of __tipc_port_disconnect() is so trivial that we choose to eliminate that function too, expanding its functionality into tipc_shutdown(). __tipc_port_connect() is simplified, moved to socket.c, and given the more correct name tipc_sk_finish_conn(). Finally, we eliminate the function auto_connect(), and expand its contents into filter_connect(). Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/port.c | 84 ------------------------------------------------------- net/tipc/port.h | 10 ------- net/tipc/socket.c | 75 ++++++++++++++++++++++++------------------------- 3 files changed, 37 insertions(+), 132 deletions(-) (limited to 'net') diff --git a/net/tipc/port.c b/net/tipc/port.c index 3ad092b7fb7d..2f96719a3514 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -40,9 +40,6 @@ #include "name_table.h" #include "socket.h" -/* Connection management: */ -#define PROBING_INTERVAL 3600000 /* [ms] => 1 h */ - #define MAX_REJECT_SIZE 1024 DEFINE_SPINLOCK(tipc_port_list_lock); @@ -304,84 +301,3 @@ int tipc_withdraw(struct tipc_port *p_ptr, unsigned int scope, p_ptr->published = 0; return res; } - -int tipc_port_connect(u32 ref, struct tipc_portid const *peer) -{ - struct tipc_port *p_ptr; - int res; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - res = __tipc_port_connect(ref, p_ptr, peer); - tipc_port_unlock(p_ptr); - return res; -} - -/* - * __tipc_port_connect - connect to a remote peer - * - * Port must be locked. - */ -int __tipc_port_connect(u32 ref, struct tipc_port *p_ptr, - struct tipc_portid const *peer) -{ - struct tipc_msg *msg; - int res = -EINVAL; - - if (p_ptr->published || p_ptr->connected) - goto exit; - if (!peer->ref) - goto exit; - - msg = &p_ptr->phdr; - msg_set_destnode(msg, peer->node); - msg_set_destport(msg, peer->ref); - msg_set_type(msg, TIPC_CONN_MSG); - msg_set_lookup_scope(msg, 0); - msg_set_hdr_sz(msg, SHORT_H_SIZE); - - p_ptr->probing_interval = PROBING_INTERVAL; - p_ptr->probing_state = TIPC_CONN_OK; - p_ptr->connected = 1; - k_start_timer(&p_ptr->timer, p_ptr->probing_interval); - res = tipc_node_add_conn(tipc_port_peernode(p_ptr), p_ptr->ref, - tipc_port_peerport(p_ptr)); -exit: - p_ptr->max_pkt = tipc_node_get_mtu(peer->node, ref); - return res; -} - -/* - * __tipc_disconnect - disconnect port from peer - * - * Port must be locked. - */ -int __tipc_port_disconnect(struct tipc_port *tp_ptr) -{ - if (tp_ptr->connected) { - tp_ptr->connected = 0; - /* let timer expire on it's own to avoid deadlock! */ - tipc_node_remove_conn(tipc_port_peernode(tp_ptr), tp_ptr->ref); - return 0; - } - - return -ENOTCONN; -} - -/* - * tipc_port_disconnect(): Disconnect port form peer. - * This is a node local operation. - */ -int tipc_port_disconnect(u32 ref) -{ - struct tipc_port *p_ptr; - int res; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - res = __tipc_port_disconnect(p_ptr); - tipc_port_unlock(p_ptr); - return res; -} diff --git a/net/tipc/port.h b/net/tipc/port.h index f5762f940e33..b356cb8340d9 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -102,16 +102,6 @@ int tipc_publish(struct tipc_port *p_ptr, unsigned int scope, int tipc_withdraw(struct tipc_port *p_ptr, unsigned int scope, struct tipc_name_seq const *name_seq); -int tipc_port_connect(u32 portref, struct tipc_portid const *port); - -int tipc_port_disconnect(u32 portref); - -/* - * The following routines require that the port be locked on entry - */ -int __tipc_port_disconnect(struct tipc_port *tp_ptr); -int __tipc_port_connect(u32 ref, struct tipc_port *p_ptr, - struct tipc_portid const *peer); int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg); struct sk_buff *tipc_port_get_ports(void); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index f202d4790ce3..a65105818fe5 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -45,6 +45,7 @@ #define SS_READY -2 /* socket is connectionless */ #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ +#define CONN_PROBING_INTERVAL 3600000 /* [ms] => 1 h */ #define TIPC_FWD_MSG 1 static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); @@ -339,7 +340,9 @@ static int tipc_release(struct socket *sock) if ((sock->state == SS_CONNECTING) || (sock->state == SS_CONNECTED)) { sock->state = SS_DISCONNECTING; - tipc_port_disconnect(port->ref); + port->connected = 0; + tipc_node_remove_conn(tipc_port_peernode(port), + port->ref); } if (tipc_msg_reverse(buf, &dnode, TIPC_ERR_NO_PORT)) tipc_link_xmit(buf, dnode, 0); @@ -988,29 +991,25 @@ static int tipc_send_packet(struct kiocb *iocb, struct socket *sock, return tipc_send_stream(iocb, sock, m, dsz); } -/** - * auto_connect - complete connection setup to a remote port - * @tsk: tipc socket structure - * @msg: peer's response message - * - * Returns 0 on success, errno otherwise +/* tipc_sk_finish_conn - complete the setup of a connection */ -static int auto_connect(struct tipc_sock *tsk, struct tipc_msg *msg) +static void tipc_sk_finish_conn(struct tipc_port *port, u32 peer_port, + u32 peer_node) { - struct tipc_port *port = &tsk->port; - struct socket *sock = tsk->sk.sk_socket; - struct tipc_portid peer; - - peer.ref = msg_origport(msg); - peer.node = msg_orignode(msg); + struct tipc_msg *msg = &port->phdr; - __tipc_port_connect(port->ref, port, &peer); + msg_set_destnode(msg, peer_node); + msg_set_destport(msg, peer_port); + msg_set_type(msg, TIPC_CONN_MSG); + msg_set_lookup_scope(msg, 0); + msg_set_hdr_sz(msg, SHORT_H_SIZE); - if (msg_importance(msg) > TIPC_CRITICAL_IMPORTANCE) - return -EINVAL; - msg_set_importance(&port->phdr, (u32)msg_importance(msg)); - sock->state = SS_CONNECTED; - return 0; + port->probing_interval = CONN_PROBING_INTERVAL; + port->probing_state = TIPC_CONN_OK; + port->connected = 1; + k_start_timer(&port->timer, port->probing_interval); + tipc_node_add_conn(peer_node, port->ref, peer_port); + port->max_pkt = tipc_node_get_mtu(peer_node, port->ref); } /** @@ -1405,7 +1404,6 @@ static int filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) struct tipc_msg *msg = buf_msg(*buf); int retval = -TIPC_ERR_NO_PORT; - int res; if (msg_mcast(msg)) return retval; @@ -1416,13 +1414,20 @@ static int filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) if (msg_connected(msg) && tipc_port_peer_msg(port, msg)) { if (unlikely(msg_errcode(msg))) { sock->state = SS_DISCONNECTING; - __tipc_port_disconnect(port); + port->connected = 0; + /* let timer expire on it's own */ + tipc_node_remove_conn(tipc_port_peernode(port), + port->ref); } retval = TIPC_OK; } break; case SS_CONNECTING: /* Accept only ACK or NACK message */ + + if (unlikely(!msg_connected(msg))) + break; + if (unlikely(msg_errcode(msg))) { sock->state = SS_DISCONNECTING; sk->sk_err = ECONNREFUSED; @@ -1430,17 +1435,17 @@ static int filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) break; } - if (unlikely(!msg_connected(msg))) - break; - - res = auto_connect(tsk, msg); - if (res) { + if (unlikely(msg_importance(msg) > TIPC_CRITICAL_IMPORTANCE)) { sock->state = SS_DISCONNECTING; - sk->sk_err = -res; + sk->sk_err = EINVAL; retval = TIPC_OK; break; } + tipc_sk_finish_conn(port, msg_origport(msg), msg_orignode(msg)); + msg_set_importance(&port->phdr, msg_importance(msg)); + sock->state = SS_CONNECTED; + /* If an incoming message is an 'ACK-', it should be * discarded here because it doesn't contain useful * data. In addition, we should try to wake up @@ -1816,8 +1821,6 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) struct sk_buff *buf; struct tipc_port *new_port; struct tipc_msg *msg; - struct tipc_portid peer; - u32 new_ref; long timeo; int res; @@ -1840,7 +1843,6 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) new_sk = new_sock->sk; new_port = &tipc_sk(new_sk)->port; - new_ref = new_port->ref; msg = buf_msg(buf); /* we lock on new_sk; but lockdep sees the lock on sk */ @@ -1853,9 +1855,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) reject_rx_queue(new_sk); /* Connect new socket to it's peer */ - peer.ref = msg_origport(msg); - peer.node = msg_orignode(msg); - tipc_port_connect(new_ref, &peer); + tipc_sk_finish_conn(new_port, msg_origport(msg), msg_orignode(msg)); new_sock->state = SS_CONNECTED; tipc_port_set_importance(new_port, msg_importance(msg)); @@ -1919,9 +1919,9 @@ restart: kfree_skb(buf); goto restart; } - tipc_port_disconnect(port->ref); if (tipc_msg_reverse(buf, &dnode, TIPC_CONN_SHUTDOWN)) tipc_link_xmit(buf, dnode, port->ref); + tipc_node_remove_conn(dnode, port->ref); } else { dnode = tipc_port_peernode(port); buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, @@ -1930,11 +1930,10 @@ restart: tipc_port_peerport(port), port->ref, TIPC_CONN_SHUTDOWN); tipc_link_xmit(buf, dnode, port->ref); - __tipc_port_disconnect(port); } - + port->connected = 0; sock->state = SS_DISCONNECTING; - + tipc_node_remove_conn(dnode, port->ref); /* fall through */ case SS_DISCONNECTING: -- cgit v1.2.3 From 739f5e4efc82c4cb6b5201cbed337b6ff663bf19 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:12 -0400 Subject: tipc: redefine message acknowledge function The function tipc_acknowledge() is a remnant from the obsolete native API. Currently, it grabs port_lock, before building an acknowledge message and sending it to the peer. Since all access to socket members now is protected by the socket lock, it has become unnecessary to grab port_lock here. In this commit, we remove the usage of port_lock, simplify the function, and move it to socket.c, renaming it to tipc_sk_send_ack(). Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/port.c | 22 ---------------------- net/tipc/port.h | 2 -- net/tipc/socket.c | 22 ++++++++++++++++++++-- 3 files changed, 20 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/tipc/port.c b/net/tipc/port.c index 2f96719a3514..1074ccb0c76f 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -221,28 +221,6 @@ void tipc_port_reinit(void) spin_unlock_bh(&tipc_port_list_lock); } -void tipc_acknowledge(u32 ref, u32 ack) -{ - struct tipc_port *p_ptr; - struct sk_buff *buf = NULL; - struct tipc_msg *msg; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return; - if (p_ptr->connected) - buf = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, - 0, tipc_port_peernode(p_ptr), - tipc_own_addr, tipc_port_peerport(p_ptr), - p_ptr->ref, TIPC_OK); - tipc_port_unlock(p_ptr); - if (!buf) - return; - msg = buf_msg(buf); - msg_set_msgcnt(msg, ack); - tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); -} - int tipc_publish(struct tipc_port *p_ptr, unsigned int scope, struct tipc_name_seq const *seq) { diff --git a/net/tipc/port.h b/net/tipc/port.h index b356cb8340d9..c92e1721f672 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -92,8 +92,6 @@ struct tipc_port_list; u32 tipc_port_init(struct tipc_port *p_ptr, const unsigned int importance); -void tipc_acknowledge(u32 port_ref, u32 ack); - void tipc_port_destroy(struct tipc_port *p_ptr); int tipc_publish(struct tipc_port *p_ptr, unsigned int scope, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index a65105818fe5..686a11be5c58 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1106,6 +1106,24 @@ static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, return 0; } +static void tipc_sk_send_ack(struct tipc_port *port, uint ack) +{ + struct sk_buff *buf = NULL; + struct tipc_msg *msg; + u32 peer_port = tipc_port_peerport(port); + u32 dnode = tipc_port_peernode(port); + + if (!port->connected) + return; + buf = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0, dnode, + tipc_own_addr, peer_port, port->ref, TIPC_OK); + if (!buf) + return; + msg = buf_msg(buf); + msg_set_msgcnt(msg, ack); + tipc_link_xmit(buf, dnode, msg_link_selector(msg)); +} + static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) { struct sock *sk = sock->sk; @@ -1226,7 +1244,7 @@ restart: if (likely(!(flags & MSG_PEEK))) { if ((sock->state != SS_READY) && (++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { - tipc_acknowledge(port->ref, tsk->rcv_unacked); + tipc_sk_send_ack(port, tsk->rcv_unacked); tsk->rcv_unacked = 0; } advance_rx_queue(sk); @@ -1337,7 +1355,7 @@ restart: /* Consume received message (optional) */ if (likely(!(flags & MSG_PEEK))) { if (unlikely(++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { - tipc_acknowledge(port->ref, tsk->rcv_unacked); + tipc_sk_send_ack(port, tsk->rcv_unacked); tsk->rcv_unacked = 0; } advance_rx_queue(sk); -- cgit v1.2.3 From 5b8fa7ce823a59a328e0a7661df2478bfb745de4 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:13 -0400 Subject: tipc: eliminate functions tipc_port_init and tipc_port_destroy After the latest changes to the socket/port layer the existence of the functions tipc_port_init() and tipc_port_destroy() cannot be justified. They are both called only once, from tipc_sk_create() and tipc_sk_delete() respectively, and their functionality can better be merged into the latter two functions. This also entails that all remaining references to port_lock now are made from inside socket.c, something that will make it easier to remove this lock. Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/port.c | 75 ++----------------------------------------------------- net/tipc/port.h | 4 +-- net/tipc/socket.c | 51 ++++++++++++++++++++++++++++--------- 3 files changed, 42 insertions(+), 88 deletions(-) (limited to 'net') diff --git a/net/tipc/port.c b/net/tipc/port.c index 1074ccb0c76f..1efa2982d2d2 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -42,10 +42,6 @@ #define MAX_REJECT_SIZE 1024 -DEFINE_SPINLOCK(tipc_port_list_lock); - -static LIST_HEAD(ports); - /** * tipc_port_peer_msg - verify message was sent by connected port's peer * @@ -67,73 +63,6 @@ int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg) (!peernode && (orignode == tipc_own_addr)); } -/* tipc_port_init - intiate TIPC port and lock it - * - * Returns obtained reference if initialization is successful, zero otherwise - */ -u32 tipc_port_init(struct tipc_port *p_ptr, - const unsigned int importance) -{ - struct tipc_msg *msg; - u32 ref; - - ref = tipc_ref_acquire(p_ptr, &p_ptr->lock); - if (!ref) { - pr_warn("Port registration failed, ref. table exhausted\n"); - return 0; - } - - p_ptr->max_pkt = MAX_PKT_DEFAULT; - p_ptr->ref = ref; - INIT_LIST_HEAD(&p_ptr->subscription.nodesub_list); - INIT_LIST_HEAD(&p_ptr->publications); - INIT_LIST_HEAD(&p_ptr->port_list); - - /* - * Must hold port list lock while initializing message header template - * to ensure a change to node's own network address doesn't result - * in template containing out-dated network address information - */ - spin_lock_bh(&tipc_port_list_lock); - msg = &p_ptr->phdr; - tipc_msg_init(msg, importance, TIPC_NAMED_MSG, NAMED_H_SIZE, 0); - msg_set_origport(msg, ref); - list_add_tail(&p_ptr->port_list, &ports); - spin_unlock_bh(&tipc_port_list_lock); - return ref; -} - -void tipc_port_destroy(struct tipc_port *p_ptr) -{ - struct sk_buff *buf = NULL; - struct tipc_msg *msg = NULL; - u32 peer_node; - - tipc_withdraw(p_ptr, 0, NULL); - - spin_lock_bh(p_ptr->lock); - tipc_ref_discard(p_ptr->ref); - spin_unlock_bh(p_ptr->lock); - - k_cancel_timer(&p_ptr->timer); - if (p_ptr->connected) { - peer_node = tipc_port_peernode(p_ptr); - buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, - SHORT_H_SIZE, 0, peer_node, - tipc_own_addr, tipc_port_peerport(p_ptr), - p_ptr->ref, TIPC_ERR_NO_PORT); - if (buf) { - msg = buf_msg(buf); - tipc_link_xmit(buf, peer_node, msg_link_selector(msg)); - } - tipc_node_remove_conn(peer_node, p_ptr->ref); - } - spin_lock_bh(&tipc_port_list_lock); - list_del(&p_ptr->port_list); - spin_unlock_bh(&tipc_port_list_lock); - k_term_timer(&p_ptr->timer); -} - static int port_print(struct tipc_port *p_ptr, char *buf, int len, int full_id) { struct publication *publ; @@ -194,7 +123,7 @@ struct sk_buff *tipc_port_get_ports(void) pb_len = ULTRA_STRING_MAX_LEN; spin_lock_bh(&tipc_port_list_lock); - list_for_each_entry(p_ptr, &ports, port_list) { + list_for_each_entry(p_ptr, &tipc_socks, port_list) { spin_lock_bh(p_ptr->lock); str_len += port_print(p_ptr, pb, pb_len, 0); spin_unlock_bh(p_ptr->lock); @@ -213,7 +142,7 @@ void tipc_port_reinit(void) struct tipc_msg *msg; spin_lock_bh(&tipc_port_list_lock); - list_for_each_entry(p_ptr, &ports, port_list) { + list_for_each_entry(p_ptr, &tipc_socks, port_list) { msg = &p_ptr->phdr; msg_set_prevnode(msg, tipc_own_addr); msg_set_orignode(msg, tipc_own_addr); diff --git a/net/tipc/port.h b/net/tipc/port.h index c92e1721f672..4c6cfdd36351 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -63,7 +63,6 @@ * @probing_state: * @probing_interval: * @timer_ref: - * @subscription: "node down" subscription used to terminate failed connections */ struct tipc_port { spinlock_t *lock; @@ -80,11 +79,10 @@ struct tipc_port { u32 probing_state; u32 probing_interval; struct timer_list timer; - struct tipc_node_subscr subscription; }; +extern struct list_head tipc_socks; extern spinlock_t tipc_port_list_lock; -struct tipc_port_list; /* * TIPC port manipulation routines diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 686a11be5c58..f2be4c2e20bb 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -63,6 +63,9 @@ static const struct proto_ops msg_ops; static struct proto tipc_proto; static struct proto tipc_proto_kern; +DEFINE_SPINLOCK(tipc_port_list_lock); +LIST_HEAD(tipc_socks); + /* * Revised TIPC socket locking policy: * @@ -156,6 +159,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, struct sock *sk; struct tipc_sock *tsk; struct tipc_port *port; + struct tipc_msg *msg; u32 ref; /* Validate arguments */ @@ -191,18 +195,28 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); port = &tsk->port; - - ref = tipc_port_init(port, TIPC_LOW_IMPORTANCE); + ref = tipc_ref_acquire(port, &port->lock); if (!ref) { - pr_warn("Socket registration failed, ref. table exhausted\n"); - sk_free(sk); + pr_warn("Socket create failed; reference table exhausted\n"); return -ENOMEM; } + port->max_pkt = MAX_PKT_DEFAULT; + port->ref = ref; + INIT_LIST_HEAD(&port->publications); + INIT_LIST_HEAD(&port->port_list); + + /* Guard against race during node address update */ + spin_lock_bh(&tipc_port_list_lock); + msg = &port->phdr; + tipc_msg_init(msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, + NAMED_H_SIZE, 0); + msg_set_origport(msg, ref); + list_add_tail(&port->port_list, &tipc_socks); + spin_unlock_bh(&tipc_port_list_lock); /* Finish initializing socket data structures */ sock->ops = ops; sock->state = state; - sock_init_data(sock, sk); k_init_timer(&port->timer, (Handler)tipc_sk_timeout, ref); sk->sk_backlog_rcv = tipc_backlog_rcv; @@ -330,6 +344,7 @@ static int tipc_release(struct socket *sock) * Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer) */ + dnode = tipc_port_peernode(port); while (sock->state != SS_DISCONNECTING) { buf = __skb_dequeue(&sk->sk_receive_queue); if (buf == NULL) @@ -341,18 +356,31 @@ static int tipc_release(struct socket *sock) (sock->state == SS_CONNECTED)) { sock->state = SS_DISCONNECTING; port->connected = 0; - tipc_node_remove_conn(tipc_port_peernode(port), - port->ref); + tipc_node_remove_conn(dnode, port->ref); } if (tipc_msg_reverse(buf, &dnode, TIPC_ERR_NO_PORT)) tipc_link_xmit(buf, dnode, 0); } } - /* Destroy TIPC port; also disconnects an active connection and - * sends a 'FIN-' to peer. - */ - tipc_port_destroy(port); + tipc_withdraw(port, 0, NULL); + spin_lock_bh(port->lock); + tipc_ref_discard(port->ref); + spin_unlock_bh(port->lock); + k_cancel_timer(&port->timer); + if (port->connected) { + buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, + SHORT_H_SIZE, 0, dnode, tipc_own_addr, + tipc_port_peerport(port), + port->ref, TIPC_ERR_NO_PORT); + if (buf) + tipc_link_xmit(buf, dnode, port->ref); + tipc_node_remove_conn(dnode, port->ref); + } + spin_lock_bh(&tipc_port_list_lock); + list_del(&port->port_list); + spin_unlock_bh(&tipc_port_list_lock); + k_term_timer(&port->timer); /* Discard any remaining (connection-based) messages in receive queue */ __skb_queue_purge(&sk->sk_receive_queue); @@ -360,7 +388,6 @@ static int tipc_release(struct socket *sock) /* Reject any messages that accumulated in backlog queue */ sock->state = SS_DISCONNECTING; release_sock(sk); - sock_put(sk); sock->sk = NULL; -- cgit v1.2.3 From 5a9ee0be3371eb77d671a77e26261931c5c3fb31 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:14 -0400 Subject: tipc: use registry when scanning sockets The functions tipc_port_get_ports() and tipc_port_reinit() scan over all sockets/ports to access each of them. This is done by using a dedicated linked list, 'tipc_socks' where all sockets are members. The list is in turn protected by a spinlock, 'port_list_lock', while each socket is locked by using port_lock at the moment of access. In order to reduce complexity and risk of deadlock, we want to get rid of the linked list and the accompanying spinlock. This is what we do in this commit. Instead of the linked list, we use the port registry to scan across the sockets. We also add usage of bh_lock_sock() inside the scope of port_lock in both functions, as a preparation for the complete removal of port_lock. Finally, we move the functions from port.c to socket.c, and rename them to tipc_sk_sock_show() and tipc_sk_reinit() repectively. Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/config.c | 4 +- net/tipc/net.c | 2 +- net/tipc/port.c | 87 ---------------------------------------- net/tipc/port.h | 5 --- net/tipc/ref.c | 20 +++++++++ net/tipc/ref.h | 1 + net/tipc/socket.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++----- net/tipc/socket.h | 3 +- 8 files changed, 133 insertions(+), 107 deletions(-) (limited to 'net') diff --git a/net/tipc/config.c b/net/tipc/config.c index 2b42403ad33a..876f4c6a2631 100644 --- a/net/tipc/config.c +++ b/net/tipc/config.c @@ -35,7 +35,7 @@ */ #include "core.h" -#include "port.h" +#include "socket.h" #include "name_table.h" #include "config.h" #include "server.h" @@ -266,7 +266,7 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area rep_tlv_buf = tipc_media_get_names(); break; case TIPC_CMD_SHOW_PORTS: - rep_tlv_buf = tipc_port_get_ports(); + rep_tlv_buf = tipc_sk_socks_show(); break; case TIPC_CMD_SHOW_STATS: rep_tlv_buf = tipc_show_stats(); diff --git a/net/tipc/net.c b/net/tipc/net.c index 7fcc94998fea..421dd89152ac 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -111,7 +111,7 @@ int tipc_net_start(u32 addr) tipc_own_addr = addr; tipc_named_reinit(); - tipc_port_reinit(); + tipc_sk_reinit(); res = tipc_bclink_init(); if (res) return res; diff --git a/net/tipc/port.c b/net/tipc/port.c index 1efa2982d2d2..cea3730fe026 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -63,93 +63,6 @@ int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg) (!peernode && (orignode == tipc_own_addr)); } -static int port_print(struct tipc_port *p_ptr, char *buf, int len, int full_id) -{ - struct publication *publ; - int ret; - - if (full_id) - ret = tipc_snprintf(buf, len, "<%u.%u.%u:%u>:", - tipc_zone(tipc_own_addr), - tipc_cluster(tipc_own_addr), - tipc_node(tipc_own_addr), p_ptr->ref); - else - ret = tipc_snprintf(buf, len, "%-10u:", p_ptr->ref); - - if (p_ptr->connected) { - u32 dport = tipc_port_peerport(p_ptr); - u32 destnode = tipc_port_peernode(p_ptr); - - ret += tipc_snprintf(buf + ret, len - ret, - " connected to <%u.%u.%u:%u>", - tipc_zone(destnode), - tipc_cluster(destnode), - tipc_node(destnode), dport); - if (p_ptr->conn_type != 0) - ret += tipc_snprintf(buf + ret, len - ret, - " via {%u,%u}", p_ptr->conn_type, - p_ptr->conn_instance); - } else if (p_ptr->published) { - ret += tipc_snprintf(buf + ret, len - ret, " bound to"); - list_for_each_entry(publ, &p_ptr->publications, pport_list) { - if (publ->lower == publ->upper) - ret += tipc_snprintf(buf + ret, len - ret, - " {%u,%u}", publ->type, - publ->lower); - else - ret += tipc_snprintf(buf + ret, len - ret, - " {%u,%u,%u}", publ->type, - publ->lower, publ->upper); - } - } - ret += tipc_snprintf(buf + ret, len - ret, "\n"); - return ret; -} - -struct sk_buff *tipc_port_get_ports(void) -{ - struct sk_buff *buf; - struct tlv_desc *rep_tlv; - char *pb; - int pb_len; - struct tipc_port *p_ptr; - int str_len = 0; - - buf = tipc_cfg_reply_alloc(TLV_SPACE(ULTRA_STRING_MAX_LEN)); - if (!buf) - return NULL; - rep_tlv = (struct tlv_desc *)buf->data; - pb = TLV_DATA(rep_tlv); - pb_len = ULTRA_STRING_MAX_LEN; - - spin_lock_bh(&tipc_port_list_lock); - list_for_each_entry(p_ptr, &tipc_socks, port_list) { - spin_lock_bh(p_ptr->lock); - str_len += port_print(p_ptr, pb, pb_len, 0); - spin_unlock_bh(p_ptr->lock); - } - spin_unlock_bh(&tipc_port_list_lock); - str_len += 1; /* for "\0" */ - skb_put(buf, TLV_SPACE(str_len)); - TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); - - return buf; -} - -void tipc_port_reinit(void) -{ - struct tipc_port *p_ptr; - struct tipc_msg *msg; - - spin_lock_bh(&tipc_port_list_lock); - list_for_each_entry(p_ptr, &tipc_socks, port_list) { - msg = &p_ptr->phdr; - msg_set_prevnode(msg, tipc_own_addr); - msg_set_orignode(msg, tipc_own_addr); - } - spin_unlock_bh(&tipc_port_list_lock); -} - int tipc_publish(struct tipc_port *p_ptr, unsigned int scope, struct tipc_name_seq const *seq) { diff --git a/net/tipc/port.h b/net/tipc/port.h index 4c6cfdd36351..4a3c54e5d69c 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -73,7 +73,6 @@ struct tipc_port { u32 max_pkt; u32 ref; struct tipc_msg phdr; - struct list_head port_list; struct list_head publications; u32 pub_count; u32 probing_state; @@ -81,9 +80,6 @@ struct tipc_port { struct timer_list timer; }; -extern struct list_head tipc_socks; -extern spinlock_t tipc_port_list_lock; - /* * TIPC port manipulation routines */ @@ -100,7 +96,6 @@ int tipc_withdraw(struct tipc_port *p_ptr, unsigned int scope, int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg); -struct sk_buff *tipc_port_get_ports(void); void tipc_port_reinit(void); /** diff --git a/net/tipc/ref.c b/net/tipc/ref.c index 3d4ecd754eee..7fc2740846e3 100644 --- a/net/tipc/ref.c +++ b/net/tipc/ref.c @@ -264,3 +264,23 @@ void *tipc_ref_lock(u32 ref) } return NULL; } + +/* tipc_ref_lock_next - lock & return next object after referenced one +*/ +void *tipc_ref_lock_next(u32 *ref) +{ + struct reference *entry; + uint index = *ref & tipc_ref_table.index_mask; + + while (++index < tipc_ref_table.capacity) { + entry = &tipc_ref_table.entries[index]; + if (!entry->object) + continue; + spin_lock_bh(&entry->lock); + *ref = entry->ref; + if (entry->object) + return entry->object; + spin_unlock_bh(&entry->lock); + } + return NULL; +} diff --git a/net/tipc/ref.h b/net/tipc/ref.h index d01aa1df63b8..e236fa520a1d 100644 --- a/net/tipc/ref.h +++ b/net/tipc/ref.h @@ -44,5 +44,6 @@ u32 tipc_ref_acquire(void *object, spinlock_t **lock); void tipc_ref_discard(u32 ref); void *tipc_ref_lock(u32 ref); +void *tipc_ref_lock_next(u32 *ref); #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index f2be4c2e20bb..ddc2f8c6fced 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -40,6 +40,7 @@ #include "node.h" #include "link.h" #include +#include "config.h" #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ @@ -63,9 +64,6 @@ static const struct proto_ops msg_ops; static struct proto tipc_proto; static struct proto tipc_proto_kern; -DEFINE_SPINLOCK(tipc_port_list_lock); -LIST_HEAD(tipc_socks); - /* * Revised TIPC socket locking policy: * @@ -113,6 +111,17 @@ LIST_HEAD(tipc_socks); #include "socket.h" +/* tipc_sk_lock_next: find & lock next socket in registry from given port number +*/ +static struct tipc_sock *tipc_sk_lock_next(u32 *ref) +{ + struct tipc_port *port = (struct tipc_port *)tipc_ref_lock_next(ref); + + if (!port) + return NULL; + return tipc_port_to_sock(port); +} + /** * advance_rx_queue - discard first buffer in socket receive queue * @@ -203,16 +212,11 @@ static int tipc_sk_create(struct net *net, struct socket *sock, port->max_pkt = MAX_PKT_DEFAULT; port->ref = ref; INIT_LIST_HEAD(&port->publications); - INIT_LIST_HEAD(&port->port_list); - /* Guard against race during node address update */ - spin_lock_bh(&tipc_port_list_lock); msg = &port->phdr; tipc_msg_init(msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, NAMED_H_SIZE, 0); msg_set_origport(msg, ref); - list_add_tail(&port->port_list, &tipc_socks); - spin_unlock_bh(&tipc_port_list_lock); /* Finish initializing socket data structures */ sock->ops = ops; @@ -377,9 +381,6 @@ static int tipc_release(struct socket *sock) tipc_link_xmit(buf, dnode, port->ref); tipc_node_remove_conn(dnode, port->ref); } - spin_lock_bh(&tipc_port_list_lock); - list_del(&port->port_list); - spin_unlock_bh(&tipc_port_list_lock); k_term_timer(&port->timer); /* Discard any remaining (connection-based) messages in receive queue */ @@ -2043,6 +2044,101 @@ static void tipc_sk_timeout(unsigned long ref) tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); } +static int tipc_sk_show(struct tipc_port *port, char *buf, + int len, int full_id) +{ + struct publication *publ; + int ret; + + if (full_id) + ret = tipc_snprintf(buf, len, "<%u.%u.%u:%u>:", + tipc_zone(tipc_own_addr), + tipc_cluster(tipc_own_addr), + tipc_node(tipc_own_addr), port->ref); + else + ret = tipc_snprintf(buf, len, "%-10u:", port->ref); + + if (port->connected) { + u32 dport = tipc_port_peerport(port); + u32 destnode = tipc_port_peernode(port); + + ret += tipc_snprintf(buf + ret, len - ret, + " connected to <%u.%u.%u:%u>", + tipc_zone(destnode), + tipc_cluster(destnode), + tipc_node(destnode), dport); + if (port->conn_type != 0) + ret += tipc_snprintf(buf + ret, len - ret, + " via {%u,%u}", port->conn_type, + port->conn_instance); + } else if (port->published) { + ret += tipc_snprintf(buf + ret, len - ret, " bound to"); + list_for_each_entry(publ, &port->publications, pport_list) { + if (publ->lower == publ->upper) + ret += tipc_snprintf(buf + ret, len - ret, + " {%u,%u}", publ->type, + publ->lower); + else + ret += tipc_snprintf(buf + ret, len - ret, + " {%u,%u,%u}", publ->type, + publ->lower, publ->upper); + } + } + ret += tipc_snprintf(buf + ret, len - ret, "\n"); + return ret; +} + +struct sk_buff *tipc_sk_socks_show(void) +{ + struct sk_buff *buf; + struct tlv_desc *rep_tlv; + char *pb; + int pb_len; + struct tipc_sock *tsk; + int str_len = 0; + u32 ref = 0; + + buf = tipc_cfg_reply_alloc(TLV_SPACE(ULTRA_STRING_MAX_LEN)); + if (!buf) + return NULL; + rep_tlv = (struct tlv_desc *)buf->data; + pb = TLV_DATA(rep_tlv); + pb_len = ULTRA_STRING_MAX_LEN; + + tsk = tipc_sk_lock_next(&ref); + for (; tsk; tsk = tipc_sk_lock_next(&ref)) { + bh_lock_sock(&tsk->sk); + str_len += tipc_sk_show(&tsk->port, pb + str_len, + pb_len - str_len, 0); + bh_unlock_sock(&tsk->sk); + tipc_port_unlock(&tsk->port); + } + str_len += 1; /* for "\0" */ + skb_put(buf, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + + return buf; +} + +/* tipc_sk_reinit: set non-zero address in all existing sockets + * when we go from standalone to network mode. + */ +void tipc_sk_reinit(void) +{ + struct tipc_msg *msg; + u32 ref = 0; + struct tipc_sock *tsk = tipc_sk_lock_next(&ref); + + for (; tsk; tsk = tipc_sk_lock_next(&ref)) { + bh_lock_sock(&tsk->sk); + msg = &tsk->port.phdr; + msg_set_prevnode(msg, tipc_own_addr); + msg_set_orignode(msg, tipc_own_addr); + bh_unlock_sock(&tsk->sk); + tipc_port_unlock(&tsk->port); + } +} + /** * tipc_setsockopt - set socket option * @sock: socket structure diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 1405633362f5..5d515be604a9 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -79,7 +79,8 @@ static inline int tipc_sk_conn_cong(struct tipc_sock *tsk) } int tipc_sk_rcv(struct sk_buff *buf); - +struct sk_buff *tipc_sk_socks_show(void); void tipc_sk_mcast_rcv(struct sk_buff *buf); +void tipc_sk_reinit(void); #endif -- cgit v1.2.3 From 9b50fd087a9f1454d6a8b613fff376dfb6d6ea93 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:15 -0400 Subject: tipc: replace port pointer with socket pointer in registry In order to make tipc_sock the only entity referencable from other parts of the stack, we add a tipc_sock pointer instead of a tipc_port pointer to the registry. As a consequence, we also let the function tipc_port_lock() return a pointer to a tipc_sock instead of a tipc_port. We keep the function's name for now, since the lock still is owned by the port. This is another step in the direction of eliminating port_lock, replacing its usage with lock_sock() and bh_lock_sock(). Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/port.h | 4 ++-- net/tipc/socket.c | 23 +++++++++-------------- 2 files changed, 11 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/tipc/port.h b/net/tipc/port.h index 4a3c54e5d69c..33e52fe50e10 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -101,9 +101,9 @@ void tipc_port_reinit(void); /** * tipc_port_lock - lock port instance referred to and return its pointer */ -static inline struct tipc_port *tipc_port_lock(u32 ref) +static inline struct tipc_sock *tipc_port_lock(u32 ref) { - return (struct tipc_port *)tipc_ref_lock(ref); + return (struct tipc_sock *)tipc_ref_lock(ref); } /** diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ddc2f8c6fced..247f245ff596 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -115,11 +115,7 @@ static struct proto tipc_proto_kern; */ static struct tipc_sock *tipc_sk_lock_next(u32 *ref) { - struct tipc_port *port = (struct tipc_port *)tipc_ref_lock_next(ref); - - if (!port) - return NULL; - return tipc_port_to_sock(port); + return (struct tipc_sock *)tipc_ref_lock_next(ref); } /** @@ -204,7 +200,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); port = &tsk->port; - ref = tipc_ref_acquire(port, &port->lock); + ref = tipc_ref_acquire(tsk, &port->lock); if (!ref) { pr_warn("Socket create failed; reference table exhausted\n"); return -ENOMEM; @@ -1655,13 +1651,12 @@ int tipc_sk_rcv(struct sk_buff *buf) u32 dnode; /* Validate destination and message */ - port = tipc_port_lock(dport); - if (unlikely(!port)) { + tsk = tipc_port_lock(dport); + if (unlikely(!tsk)) { rc = tipc_msg_eval(buf, &dnode); goto exit; } - - tsk = tipc_port_to_sock(port); + port = &tsk->port; sk = &tsk->sk; /* Queue message */ @@ -2002,21 +1997,21 @@ restart: static void tipc_sk_timeout(unsigned long ref) { - struct tipc_port *port = tipc_port_lock(ref); - struct tipc_sock *tsk; + struct tipc_sock *tsk = tipc_port_lock(ref); + struct tipc_port *port; struct sock *sk; struct sk_buff *buf = NULL; struct tipc_msg *msg = NULL; u32 peer_port, peer_node; - if (!port) + if (!tsk) return; + port = &tsk->port; if (!port->connected) { tipc_port_unlock(port); return; } - tsk = tipc_port_to_sock(port); sk = &tsk->sk; bh_lock_sock(sk); peer_port = tipc_port_peerport(port); -- cgit v1.2.3 From 6c9808ce09f778a1de7b207b82cfc36a59cda2d3 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:16 -0400 Subject: tipc: remove port_lock In previous commits we have reduced usage of port_lock to a minimum, and complemented it with usage of bh_lock_sock() at the remaining locations. The purpose has been to remove this lock altogether, since it largely duplicates the role of bh_lock_sock. We are now ready to do this. However, we still need to protect the BH callers from inadvertent release of the socket while they hold a reference to it. We do this by replacing port_lock by a combination of a rw-lock protecting the reference table as such, and updating the socket reference counter while the socket is referenced from BH. This technique is more standard and comprehensible than the previous approach, and turns out to have a positive effect on overall performance. Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/port.h | 20 ------- net/tipc/ref.c | 158 ++++++++++++++++++++++++------------------------------ net/tipc/ref.h | 15 ++++-- net/tipc/socket.c | 64 +++++++++------------- 4 files changed, 108 insertions(+), 149 deletions(-) (limited to 'net') diff --git a/net/tipc/port.h b/net/tipc/port.h index 33e52fe50e10..38bf8cb3df1a 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -37,7 +37,6 @@ #ifndef _TIPC_PORT_H #define _TIPC_PORT_H -#include "ref.h" #include "net.h" #include "msg.h" #include "node_subscr.h" @@ -65,7 +64,6 @@ * @timer_ref: */ struct tipc_port { - spinlock_t *lock; int connected; u32 conn_type; u32 conn_instance; @@ -98,24 +96,6 @@ int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg); void tipc_port_reinit(void); -/** - * tipc_port_lock - lock port instance referred to and return its pointer - */ -static inline struct tipc_sock *tipc_port_lock(u32 ref) -{ - return (struct tipc_sock *)tipc_ref_lock(ref); -} - -/** - * tipc_port_unlock - unlock a port instance - * - * Can use pointer instead of tipc_ref_unlock() since port is already locked. - */ -static inline void tipc_port_unlock(struct tipc_port *p_ptr) -{ - spin_unlock_bh(p_ptr->lock); -} - static inline u32 tipc_port_peernode(struct tipc_port *p_ptr) { return msg_destnode(&p_ptr->phdr); diff --git a/net/tipc/ref.c b/net/tipc/ref.c index 7fc2740846e3..ea981bed967b 100644 --- a/net/tipc/ref.c +++ b/net/tipc/ref.c @@ -1,7 +1,7 @@ /* - * net/tipc/ref.c: TIPC object registry code + * net/tipc/ref.c: TIPC socket registry code * - * Copyright (c) 1991-2006, Ericsson AB + * Copyright (c) 1991-2006, 2014, Ericsson AB * Copyright (c) 2004-2007, Wind River Systems * All rights reserved. * @@ -38,24 +38,22 @@ #include "ref.h" /** - * struct reference - TIPC object reference entry - * @object: pointer to object associated with reference entry - * @lock: spinlock controlling access to object - * @ref: reference value for object (combines instance & array index info) + * struct reference - TIPC socket reference entry + * @tsk: pointer to socket associated with reference entry + * @ref: reference value for socket (combines instance & array index info) */ struct reference { - void *object; - spinlock_t lock; + struct tipc_sock *tsk; u32 ref; }; /** - * struct tipc_ref_table - table of TIPC object reference entries + * struct tipc_ref_table - table of TIPC socket reference entries * @entries: pointer to array of reference entries * @capacity: array index of first unusable entry * @init_point: array index of first uninitialized entry - * @first_free: array index of first unused object reference entry - * @last_free: array index of last unused object reference entry + * @first_free: array index of first unused socket reference entry + * @last_free: array index of last unused socket reference entry * @index_mask: bitmask for array index portion of reference values * @start_mask: initial value for instance value portion of reference values */ @@ -70,9 +68,9 @@ struct ref_table { }; /* - * Object reference table consists of 2**N entries. + * Socket reference table consists of 2**N entries. * - * State Object ptr Reference + * State Socket ptr Reference * ----- ---------- --------- * In use non-NULL XXXX|own index * (XXXX changes each time entry is acquired) @@ -89,10 +87,10 @@ struct ref_table { static struct ref_table tipc_ref_table; -static DEFINE_SPINLOCK(ref_table_lock); +static DEFINE_RWLOCK(ref_table_lock); /** - * tipc_ref_table_init - create reference table for objects + * tipc_ref_table_init - create reference table for sockets */ int tipc_ref_table_init(u32 requested_size, u32 start) { @@ -122,84 +120,69 @@ int tipc_ref_table_init(u32 requested_size, u32 start) } /** - * tipc_ref_table_stop - destroy reference table for objects + * tipc_ref_table_stop - destroy reference table for sockets */ void tipc_ref_table_stop(void) { + if (!tipc_ref_table.entries) + return; vfree(tipc_ref_table.entries); tipc_ref_table.entries = NULL; } -/** - * tipc_ref_acquire - create reference to an object +/* tipc_ref_acquire - create reference to a socket * - * Register an object pointer in reference table and lock the object. + * Register an socket pointer in the reference table. * Returns a unique reference value that is used from then on to retrieve the - * object pointer, or to determine that the object has been deregistered. - * - * Note: The object is returned in the locked state so that the caller can - * register a partially initialized object, without running the risk that - * the object will be accessed before initialization is complete. + * socket pointer, or to determine if the socket has been deregistered. */ -u32 tipc_ref_acquire(void *object, spinlock_t **lock) +u32 tipc_ref_acquire(struct tipc_sock *tsk) { u32 index; u32 index_mask; u32 next_plus_upper; - u32 ref; - struct reference *entry = NULL; + u32 ref = 0; + struct reference *entry; - if (!object) { + if (unlikely(!tsk)) { pr_err("Attempt to acquire ref. to non-existent obj\n"); return 0; } - if (!tipc_ref_table.entries) { + if (unlikely(!tipc_ref_table.entries)) { pr_err("Ref. table not found in acquisition attempt\n"); return 0; } - /* take a free entry, if available; otherwise initialize a new entry */ - spin_lock_bh(&ref_table_lock); - if (tipc_ref_table.first_free) { + /* Take a free entry, if available; otherwise initialize a new one */ + write_lock_bh(&ref_table_lock); + index = tipc_ref_table.first_free; + entry = &tipc_ref_table.entries[index]; + + if (likely(index)) { index = tipc_ref_table.first_free; entry = &(tipc_ref_table.entries[index]); index_mask = tipc_ref_table.index_mask; next_plus_upper = entry->ref; tipc_ref_table.first_free = next_plus_upper & index_mask; ref = (next_plus_upper & ~index_mask) + index; + entry->tsk = tsk; } else if (tipc_ref_table.init_point < tipc_ref_table.capacity) { index = tipc_ref_table.init_point++; entry = &(tipc_ref_table.entries[index]); - spin_lock_init(&entry->lock); ref = tipc_ref_table.start_mask + index; - } else { - ref = 0; } - spin_unlock_bh(&ref_table_lock); - /* - * Grab the lock so no one else can modify this entry - * While we assign its ref value & object pointer - */ - if (entry) { - spin_lock_bh(&entry->lock); + if (ref) { entry->ref = ref; - entry->object = object; - *lock = &entry->lock; - /* - * keep it locked, the caller is responsible - * for unlocking this when they're done with it - */ + entry->tsk = tsk; } - + write_unlock_bh(&ref_table_lock); return ref; } -/** - * tipc_ref_discard - invalidate references to an object +/* tipc_ref_discard - invalidate reference to an socket * - * Disallow future references to an object and free up the entry for re-use. - * Note: The entry's spin_lock may still be busy after discard + * Disallow future references to an socket and free up the entry for re-use. */ void tipc_ref_discard(u32 ref) { @@ -207,7 +190,7 @@ void tipc_ref_discard(u32 ref) u32 index; u32 index_mask; - if (!tipc_ref_table.entries) { + if (unlikely(!tipc_ref_table.entries)) { pr_err("Ref. table not found during discard attempt\n"); return; } @@ -216,71 +199,72 @@ void tipc_ref_discard(u32 ref) index = ref & index_mask; entry = &(tipc_ref_table.entries[index]); - spin_lock_bh(&ref_table_lock); + write_lock_bh(&ref_table_lock); - if (!entry->object) { - pr_err("Attempt to discard ref. to non-existent obj\n"); + if (unlikely(!entry->tsk)) { + pr_err("Attempt to discard ref. to non-existent socket\n"); goto exit; } - if (entry->ref != ref) { + if (unlikely(entry->ref != ref)) { pr_err("Attempt to discard non-existent reference\n"); goto exit; } /* - * mark entry as unused; increment instance part of entry's reference + * Mark entry as unused; increment instance part of entry's reference * to invalidate any subsequent references */ - entry->object = NULL; + entry->tsk = NULL; entry->ref = (ref & ~index_mask) + (index_mask + 1); - /* append entry to free entry list */ - if (tipc_ref_table.first_free == 0) + /* Append entry to free entry list */ + if (unlikely(tipc_ref_table.first_free == 0)) tipc_ref_table.first_free = index; else tipc_ref_table.entries[tipc_ref_table.last_free].ref |= index; tipc_ref_table.last_free = index; - exit: - spin_unlock_bh(&ref_table_lock); + write_unlock_bh(&ref_table_lock); } -/** - * tipc_ref_lock - lock referenced object and return pointer to it +/* tipc_sk_get - find referenced socket and return pointer to it */ -void *tipc_ref_lock(u32 ref) +struct tipc_sock *tipc_sk_get(u32 ref) { - if (likely(tipc_ref_table.entries)) { - struct reference *entry; + struct reference *entry; + struct tipc_sock *tsk; - entry = &tipc_ref_table.entries[ref & - tipc_ref_table.index_mask]; - if (likely(entry->ref != 0)) { - spin_lock_bh(&entry->lock); - if (likely((entry->ref == ref) && (entry->object))) - return entry->object; - spin_unlock_bh(&entry->lock); - } - } - return NULL; + if (unlikely(!tipc_ref_table.entries)) + return NULL; + read_lock_bh(&ref_table_lock); + entry = &tipc_ref_table.entries[ref & tipc_ref_table.index_mask]; + tsk = entry->tsk; + if (likely(tsk && (entry->ref == ref))) + sock_hold(&tsk->sk); + else + tsk = NULL; + read_unlock_bh(&ref_table_lock); + return tsk; } -/* tipc_ref_lock_next - lock & return next object after referenced one +/* tipc_sk_get_next - lock & return next socket after referenced one */ -void *tipc_ref_lock_next(u32 *ref) +struct tipc_sock *tipc_sk_get_next(u32 *ref) { struct reference *entry; + struct tipc_sock *tsk = NULL; uint index = *ref & tipc_ref_table.index_mask; + read_lock_bh(&ref_table_lock); while (++index < tipc_ref_table.capacity) { entry = &tipc_ref_table.entries[index]; - if (!entry->object) + if (!entry->tsk) continue; - spin_lock_bh(&entry->lock); + tsk = entry->tsk; + sock_hold(&tsk->sk); *ref = entry->ref; - if (entry->object) - return entry->object; - spin_unlock_bh(&entry->lock); + break; } - return NULL; + read_unlock_bh(&ref_table_lock); + return tsk; } diff --git a/net/tipc/ref.h b/net/tipc/ref.h index e236fa520a1d..2b75a892305a 100644 --- a/net/tipc/ref.h +++ b/net/tipc/ref.h @@ -1,7 +1,7 @@ /* * net/tipc/ref.h: Include file for TIPC object registry code * - * Copyright (c) 1991-2006, Ericsson AB + * Copyright (c) 1991-2006, 2014, Ericsson AB * Copyright (c) 2005-2006, Wind River Systems * All rights reserved. * @@ -37,13 +37,20 @@ #ifndef _TIPC_REF_H #define _TIPC_REF_H +#include "socket.h" + int tipc_ref_table_init(u32 requested_size, u32 start); void tipc_ref_table_stop(void); -u32 tipc_ref_acquire(void *object, spinlock_t **lock); +u32 tipc_ref_acquire(struct tipc_sock *tsk); void tipc_ref_discard(u32 ref); -void *tipc_ref_lock(u32 ref); -void *tipc_ref_lock_next(u32 *ref); +struct tipc_sock *tipc_sk_get(u32 ref); +struct tipc_sock *tipc_sk_get_next(u32 *ref); + +static inline void tipc_sk_put(struct tipc_sock *tsk) +{ + sock_put(&tsk->sk); +} #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 247f245ff596..7e6240e41e69 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -35,6 +35,7 @@ */ #include "core.h" +#include "ref.h" #include "port.h" #include "name_table.h" #include "node.h" @@ -111,13 +112,6 @@ static struct proto tipc_proto_kern; #include "socket.h" -/* tipc_sk_lock_next: find & lock next socket in registry from given port number -*/ -static struct tipc_sock *tipc_sk_lock_next(u32 *ref) -{ - return (struct tipc_sock *)tipc_ref_lock_next(ref); -} - /** * advance_rx_queue - discard first buffer in socket receive queue * @@ -200,7 +194,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); port = &tsk->port; - ref = tipc_ref_acquire(tsk, &port->lock); + ref = tipc_ref_acquire(tsk); if (!ref) { pr_warn("Socket create failed; reference table exhausted\n"); return -ENOMEM; @@ -226,7 +220,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; tsk->sent_unacked = 0; atomic_set(&tsk->dupl_rcvcnt, 0); - tipc_port_unlock(port); if (sock->state == SS_READY) { tipc_port_set_unreturnable(port, true); @@ -364,9 +357,7 @@ static int tipc_release(struct socket *sock) } tipc_withdraw(port, 0, NULL); - spin_lock_bh(port->lock); tipc_ref_discard(port->ref); - spin_unlock_bh(port->lock); k_cancel_timer(&port->timer); if (port->connected) { buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, @@ -1651,7 +1642,7 @@ int tipc_sk_rcv(struct sk_buff *buf) u32 dnode; /* Validate destination and message */ - tsk = tipc_port_lock(dport); + tsk = tipc_sk_get(dport); if (unlikely(!tsk)) { rc = tipc_msg_eval(buf, &dnode); goto exit; @@ -1672,8 +1663,7 @@ int tipc_sk_rcv(struct sk_buff *buf) rc = -TIPC_ERR_OVERLOAD; } bh_unlock_sock(sk); - tipc_port_unlock(port); - + tipc_sk_put(tsk); if (likely(!rc)) return 0; exit: @@ -1997,23 +1987,23 @@ restart: static void tipc_sk_timeout(unsigned long ref) { - struct tipc_sock *tsk = tipc_port_lock(ref); + struct tipc_sock *tsk; struct tipc_port *port; struct sock *sk; struct sk_buff *buf = NULL; - struct tipc_msg *msg = NULL; u32 peer_port, peer_node; + tsk = tipc_sk_get(ref); if (!tsk) - return; - + goto exit; + sk = &tsk->sk; port = &tsk->port; + + bh_lock_sock(sk); if (!port->connected) { - tipc_port_unlock(port); - return; + bh_unlock_sock(sk); + goto exit; } - sk = &tsk->sk; - bh_lock_sock(sk); peer_port = tipc_port_peerport(port); peer_node = tipc_port_peernode(port); @@ -2031,12 +2021,10 @@ static void tipc_sk_timeout(unsigned long ref) k_start_timer(&port->timer, port->probing_interval); } bh_unlock_sock(sk); - tipc_port_unlock(port); - if (!buf) - return; - - msg = buf_msg(buf); - tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); + if (buf) + tipc_link_xmit(buf, peer_node, ref); +exit: + tipc_sk_put(tsk); } static int tipc_sk_show(struct tipc_port *port, char *buf, @@ -2100,13 +2088,13 @@ struct sk_buff *tipc_sk_socks_show(void) pb = TLV_DATA(rep_tlv); pb_len = ULTRA_STRING_MAX_LEN; - tsk = tipc_sk_lock_next(&ref); - for (; tsk; tsk = tipc_sk_lock_next(&ref)) { - bh_lock_sock(&tsk->sk); + tsk = tipc_sk_get_next(&ref); + for (; tsk; tsk = tipc_sk_get_next(&ref)) { + lock_sock(&tsk->sk); str_len += tipc_sk_show(&tsk->port, pb + str_len, pb_len - str_len, 0); - bh_unlock_sock(&tsk->sk); - tipc_port_unlock(&tsk->port); + release_sock(&tsk->sk); + tipc_sk_put(tsk); } str_len += 1; /* for "\0" */ skb_put(buf, TLV_SPACE(str_len)); @@ -2122,15 +2110,15 @@ void tipc_sk_reinit(void) { struct tipc_msg *msg; u32 ref = 0; - struct tipc_sock *tsk = tipc_sk_lock_next(&ref); + struct tipc_sock *tsk = tipc_sk_get_next(&ref); - for (; tsk; tsk = tipc_sk_lock_next(&ref)) { - bh_lock_sock(&tsk->sk); + for (; tsk; tsk = tipc_sk_get_next(&ref)) { + lock_sock(&tsk->sk); msg = &tsk->port.phdr; msg_set_prevnode(msg, tipc_own_addr); msg_set_orignode(msg, tipc_own_addr); - bh_unlock_sock(&tsk->sk); - tipc_port_unlock(&tsk->port); + release_sock(&tsk->sk); + tipc_sk_put(tsk); } } -- cgit v1.2.3 From 0fc87aaebdfbf2c75112ce17aec093652c682acd Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:17 -0400 Subject: tipc: remove source file port.c In this commit, we move the remaining functions in port.c to socket.c, and give them new names that correspond to their new location. We then remove the file port.c. There are only cosmetic changes to the moved functions. Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/Makefile | 2 +- net/tipc/port.c | 123 ------------------------------------------------------ net/tipc/socket.c | 102 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 97 insertions(+), 130 deletions(-) delete mode 100644 net/tipc/port.c (limited to 'net') diff --git a/net/tipc/Makefile b/net/tipc/Makefile index a080c66d819a..5206a4440222 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_TIPC) := tipc.o tipc-y += addr.o bcast.o bearer.o config.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o name_table.o net.o \ - netlink.o node.o node_subscr.o port.o ref.o \ + netlink.o node.o node_subscr.o ref.o \ socket.o log.o eth_media.o server.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o diff --git a/net/tipc/port.c b/net/tipc/port.c deleted file mode 100644 index cea3730fe026..000000000000 --- a/net/tipc/port.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * net/tipc/port.c: TIPC port code - * - * Copyright (c) 1992-2007, 2014, Ericsson AB - * Copyright (c) 2004-2008, 2010-2013, Wind River Systems - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "core.h" -#include "config.h" -#include "port.h" -#include "name_table.h" -#include "socket.h" - -#define MAX_REJECT_SIZE 1024 - -/** - * tipc_port_peer_msg - verify message was sent by connected port's peer - * - * Handles cases where the node's network address has changed from - * the default of <0.0.0> to its configured setting. - */ -int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg) -{ - u32 peernode; - u32 orignode; - - if (msg_origport(msg) != tipc_port_peerport(p_ptr)) - return 0; - - orignode = msg_orignode(msg); - peernode = tipc_port_peernode(p_ptr); - return (orignode == peernode) || - (!orignode && (peernode == tipc_own_addr)) || - (!peernode && (orignode == tipc_own_addr)); -} - -int tipc_publish(struct tipc_port *p_ptr, unsigned int scope, - struct tipc_name_seq const *seq) -{ - struct publication *publ; - u32 key; - - if (p_ptr->connected) - return -EINVAL; - key = p_ptr->ref + p_ptr->pub_count + 1; - if (key == p_ptr->ref) - return -EADDRINUSE; - - publ = tipc_nametbl_publish(seq->type, seq->lower, seq->upper, - scope, p_ptr->ref, key); - if (publ) { - list_add(&publ->pport_list, &p_ptr->publications); - p_ptr->pub_count++; - p_ptr->published = 1; - return 0; - } - return -EINVAL; -} - -int tipc_withdraw(struct tipc_port *p_ptr, unsigned int scope, - struct tipc_name_seq const *seq) -{ - struct publication *publ; - struct publication *tpubl; - int res = -EINVAL; - - if (!seq) { - list_for_each_entry_safe(publ, tpubl, - &p_ptr->publications, pport_list) { - tipc_nametbl_withdraw(publ->type, publ->lower, - publ->ref, publ->key); - } - res = 0; - } else { - list_for_each_entry_safe(publ, tpubl, - &p_ptr->publications, pport_list) { - if (publ->scope != scope) - continue; - if (publ->type != seq->type) - continue; - if (publ->lower != seq->lower) - continue; - if (publ->upper != seq->upper) - break; - tipc_nametbl_withdraw(publ->type, publ->lower, - publ->ref, publ->key); - res = 0; - break; - } - } - if (list_empty(&p_ptr->publications)) - p_ptr->published = 0; - return res; -} diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7e6240e41e69..ea33eab4fb9d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -57,6 +57,10 @@ static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); static void tipc_sk_timeout(unsigned long ref); +static int tipc_sk_publish(struct tipc_port *port, uint scope, + struct tipc_name_seq const *seq); +static int tipc_sk_withdraw(struct tipc_port *port, uint scope, + struct tipc_name_seq const *seq); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -138,6 +142,38 @@ static void reject_rx_queue(struct sock *sk) } } +/* tipc_sk_peer_msg - verify if message was sent by connected port's peer + * + * Handles cases where the node's network address has changed from + * the default of <0.0.0> to its configured setting. + */ +static bool tipc_sk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) +{ + u32 peer_port = tipc_port_peerport(&tsk->port); + u32 orig_node; + u32 peer_node; + + if (unlikely(!tsk->port.connected)) + return false; + + if (unlikely(msg_origport(msg) != peer_port)) + return false; + + orig_node = msg_orignode(msg); + peer_node = tipc_port_peernode(&tsk->port); + + if (likely(orig_node == peer_node)) + return true; + + if (!orig_node && (peer_node == tipc_own_addr)) + return true; + + if (!peer_node && (orig_node == tipc_own_addr)) + return true; + + return false; +} + /** * tipc_sk_create - create a TIPC socket * @net: network namespace (must be default network) @@ -356,7 +392,7 @@ static int tipc_release(struct socket *sock) } } - tipc_withdraw(port, 0, NULL); + tipc_sk_withdraw(port, 0, NULL); tipc_ref_discard(port->ref); k_cancel_timer(&port->timer); if (port->connected) { @@ -407,7 +443,7 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, lock_sock(sk); if (unlikely(!uaddr_len)) { - res = tipc_withdraw(&tsk->port, 0, NULL); + res = tipc_sk_withdraw(&tsk->port, 0, NULL); goto exit; } @@ -435,8 +471,8 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, } res = (addr->scope > 0) ? - tipc_publish(&tsk->port, addr->scope, &addr->addr.nameseq) : - tipc_withdraw(&tsk->port, -addr->scope, &addr->addr.nameseq); + tipc_sk_publish(&tsk->port, addr->scope, &addr->addr.nameseq) : + tipc_sk_withdraw(&tsk->port, -addr->scope, &addr->addr.nameseq); exit: release_sock(sk); return res; @@ -663,7 +699,7 @@ static int tipc_sk_proto_rcv(struct tipc_sock *tsk, u32 *dnode, int conn_cong; /* Ignore if connection cannot be validated: */ - if (!port->connected || !tipc_port_peer_msg(port, msg)) + if (!tipc_sk_peer_msg(tsk, msg)) goto exit; port->probing_state = TIPC_CONN_OK; @@ -1444,7 +1480,7 @@ static int filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) switch ((int)sock->state) { case SS_CONNECTED: /* Accept only connection-based messages sent by peer */ - if (msg_connected(msg) && tipc_port_peer_msg(port, msg)) { + if (tipc_sk_peer_msg(tsk, msg)) { if (unlikely(msg_errcode(msg))) { sock->state = SS_DISCONNECTING; port->connected = 0; @@ -2027,6 +2063,60 @@ exit: tipc_sk_put(tsk); } +static int tipc_sk_publish(struct tipc_port *port, uint scope, + struct tipc_name_seq const *seq) +{ + struct publication *publ; + u32 key; + + if (port->connected) + return -EINVAL; + key = port->ref + port->pub_count + 1; + if (key == port->ref) + return -EADDRINUSE; + + publ = tipc_nametbl_publish(seq->type, seq->lower, seq->upper, + scope, port->ref, key); + if (unlikely(!publ)) + return -EINVAL; + + list_add(&publ->pport_list, &port->publications); + port->pub_count++; + port->published = 1; + return 0; +} + +static int tipc_sk_withdraw(struct tipc_port *port, uint scope, + struct tipc_name_seq const *seq) +{ + struct publication *publ; + struct publication *safe; + int rc = -EINVAL; + + list_for_each_entry_safe(publ, safe, &port->publications, pport_list) { + if (seq) { + if (publ->scope != scope) + continue; + if (publ->type != seq->type) + continue; + if (publ->lower != seq->lower) + continue; + if (publ->upper != seq->upper) + break; + tipc_nametbl_withdraw(publ->type, publ->lower, + publ->ref, publ->key); + rc = 0; + break; + } + tipc_nametbl_withdraw(publ->type, publ->lower, + publ->ref, publ->key); + rc = 0; + } + if (list_empty(&port->publications)) + port->published = 0; + return rc; +} + static int tipc_sk_show(struct tipc_port *port, char *buf, int len, int full_id) { -- cgit v1.2.3 From 2e84c60b77e4dd96068f568a5971e681bb7e6b68 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:18 -0400 Subject: tipc: remove include file port.h We move the inline functions in the file port.h to socket.c, and modify their names accordingly. We move struct tipc_port and some macros to socket.h. Finally, we remove the file port.h. Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bcast.c | 1 - net/tipc/core.c | 2 +- net/tipc/link.c | 1 - net/tipc/name_table.c | 1 - net/tipc/net.c | 1 - net/tipc/port.h | 145 -------------------------------------------------- net/tipc/socket.c | 123 ++++++++++++++++++++++++++++-------------- net/tipc/socket.h | 39 +++++++++++++- net/tipc/subscr.c | 1 - 9 files changed, 121 insertions(+), 193 deletions(-) delete mode 100644 net/tipc/port.h (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 9510fb2df566..b2bbe69b2554 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -37,7 +37,6 @@ #include "core.h" #include "link.h" -#include "port.h" #include "socket.h" #include "msg.h" #include "bcast.h" diff --git a/net/tipc/core.c b/net/tipc/core.c index 676d18015dd8..b3b03ef30df5 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -39,7 +39,7 @@ #include "name_table.h" #include "subscr.h" #include "config.h" -#include "port.h" +#include "socket.h" #include diff --git a/net/tipc/link.c b/net/tipc/link.c index 6c775a107a02..65410e18b8a6 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -36,7 +36,6 @@ #include "core.h" #include "link.h" -#include "port.h" #include "socket.h" #include "name_distr.h" #include "discover.h" diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 9d7d37d95187..c058e30f84aa 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -39,7 +39,6 @@ #include "name_table.h" #include "name_distr.h" #include "subscr.h" -#include "port.h" #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ diff --git a/net/tipc/net.c b/net/tipc/net.c index 421dd89152ac..93b9944a6a8b 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -38,7 +38,6 @@ #include "net.h" #include "name_distr.h" #include "subscr.h" -#include "port.h" #include "socket.h" #include "node.h" #include "config.h" diff --git a/net/tipc/port.h b/net/tipc/port.h deleted file mode 100644 index 38bf8cb3df1a..000000000000 --- a/net/tipc/port.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * net/tipc/port.h: Include file for TIPC port code - * - * Copyright (c) 1994-2007, 2014, Ericsson AB - * Copyright (c) 2004-2007, 2010-2013, Wind River Systems - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _TIPC_PORT_H -#define _TIPC_PORT_H - -#include "net.h" -#include "msg.h" -#include "node_subscr.h" - -#define TIPC_CONNACK_INTV 256 -#define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2) -#define TIPC_CONN_OVERLOAD_LIMIT ((TIPC_FLOWCTRL_WIN * 2 + 1) * \ - SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) - -/** - * struct tipc_port - TIPC port structure - * @lock: pointer to spinlock for controlling access to port - * @connected: non-zero if port is currently connected to a peer port - * @conn_type: TIPC type used when connection was established - * @conn_instance: TIPC instance used when connection was established - * @published: non-zero if port has one or more associated names - * @max_pkt: maximum packet size "hint" used when building messages sent by port - * @ref: unique reference to port in TIPC object registry - * @phdr: preformatted message header used when sending messages - * @port_list: adjacent ports in TIPC's global list of ports - * @publications: list of publications for port - * @pub_count: total # of publications port has made during its lifetime - * @probing_state: - * @probing_interval: - * @timer_ref: - */ -struct tipc_port { - int connected; - u32 conn_type; - u32 conn_instance; - int published; - u32 max_pkt; - u32 ref; - struct tipc_msg phdr; - struct list_head publications; - u32 pub_count; - u32 probing_state; - u32 probing_interval; - struct timer_list timer; -}; - -/* - * TIPC port manipulation routines - */ -u32 tipc_port_init(struct tipc_port *p_ptr, - const unsigned int importance); - -void tipc_port_destroy(struct tipc_port *p_ptr); - -int tipc_publish(struct tipc_port *p_ptr, unsigned int scope, - struct tipc_name_seq const *name_seq); - -int tipc_withdraw(struct tipc_port *p_ptr, unsigned int scope, - struct tipc_name_seq const *name_seq); - -int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg); - -void tipc_port_reinit(void); - -static inline u32 tipc_port_peernode(struct tipc_port *p_ptr) -{ - return msg_destnode(&p_ptr->phdr); -} - -static inline u32 tipc_port_peerport(struct tipc_port *p_ptr) -{ - return msg_destport(&p_ptr->phdr); -} - -static inline bool tipc_port_unreliable(struct tipc_port *port) -{ - return msg_src_droppable(&port->phdr) != 0; -} - -static inline void tipc_port_set_unreliable(struct tipc_port *port, - bool unreliable) -{ - msg_set_src_droppable(&port->phdr, unreliable ? 1 : 0); -} - -static inline bool tipc_port_unreturnable(struct tipc_port *port) -{ - return msg_dest_droppable(&port->phdr) != 0; -} - -static inline void tipc_port_set_unreturnable(struct tipc_port *port, - bool unreturnable) -{ - msg_set_dest_droppable(&port->phdr, unreturnable ? 1 : 0); -} - - -static inline int tipc_port_importance(struct tipc_port *port) -{ - return msg_importance(&port->phdr); -} - -static inline int tipc_port_set_importance(struct tipc_port *port, int imp) -{ - if (imp > TIPC_CRITICAL_IMPORTANCE) - return -EINVAL; - msg_set_importance(&port->phdr, (u32)imp); - return 0; -} - -#endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ea33eab4fb9d..70eaceae1f8c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -36,12 +36,12 @@ #include "core.h" #include "ref.h" -#include "port.h" #include "name_table.h" #include "node.h" #include "link.h" #include #include "config.h" +#include "socket.h" #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ @@ -114,24 +114,65 @@ static struct proto tipc_proto_kern; * - port reference */ -#include "socket.h" +static u32 tsk_peer_node(struct tipc_port *p_ptr) +{ + return msg_destnode(&p_ptr->phdr); +} + +static u32 tsk_peer_port(struct tipc_port *p_ptr) +{ + return msg_destport(&p_ptr->phdr); +} + +static bool tsk_unreliable(struct tipc_port *port) +{ + return msg_src_droppable(&port->phdr) != 0; +} + +static void tsk_set_unreliable(struct tipc_port *port, bool unreliable) +{ + msg_set_src_droppable(&port->phdr, unreliable ? 1 : 0); +} + +static bool tsk_unreturnable(struct tipc_port *port) +{ + return msg_dest_droppable(&port->phdr) != 0; +} + +static void tsk_set_unreturnable(struct tipc_port *port, bool unreturnable) +{ + msg_set_dest_droppable(&port->phdr, unreturnable ? 1 : 0); +} + +static int tsk_importance(struct tipc_port *port) +{ + return msg_importance(&port->phdr); +} + +static int tsk_set_importance(struct tipc_port *port, int imp) +{ + if (imp > TIPC_CRITICAL_IMPORTANCE) + return -EINVAL; + msg_set_importance(&port->phdr, (u32)imp); + return 0; +} /** - * advance_rx_queue - discard first buffer in socket receive queue + * tsk_advance_rx_queue - discard first buffer in socket receive queue * * Caller must hold socket lock */ -static void advance_rx_queue(struct sock *sk) +static void tsk_advance_rx_queue(struct sock *sk) { kfree_skb(__skb_dequeue(&sk->sk_receive_queue)); } /** - * reject_rx_queue - reject all buffers in socket receive queue + * tsk_rej_rx_queue - reject all buffers in socket receive queue * * Caller must hold socket lock */ -static void reject_rx_queue(struct sock *sk) +static void tsk_rej_rx_queue(struct sock *sk) { struct sk_buff *buf; u32 dnode; @@ -142,14 +183,14 @@ static void reject_rx_queue(struct sock *sk) } } -/* tipc_sk_peer_msg - verify if message was sent by connected port's peer +/* tsk_peer_msg - verify if message was sent by connected port's peer * * Handles cases where the node's network address has changed from * the default of <0.0.0> to its configured setting. */ -static bool tipc_sk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) +static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) { - u32 peer_port = tipc_port_peerport(&tsk->port); + u32 peer_port = tsk_peer_port(&tsk->port); u32 orig_node; u32 peer_node; @@ -160,7 +201,7 @@ static bool tipc_sk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) return false; orig_node = msg_orignode(msg); - peer_node = tipc_port_peernode(&tsk->port); + peer_node = tsk_peer_node(&tsk->port); if (likely(orig_node == peer_node)) return true; @@ -258,9 +299,9 @@ static int tipc_sk_create(struct net *net, struct socket *sock, atomic_set(&tsk->dupl_rcvcnt, 0); if (sock->state == SS_READY) { - tipc_port_set_unreturnable(port, true); + tsk_set_unreturnable(port, true); if (sock->type == SOCK_DGRAM) - tipc_port_set_unreliable(port, true); + tsk_set_unreliable(port, true); } return 0; } @@ -373,7 +414,7 @@ static int tipc_release(struct socket *sock) * Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer) */ - dnode = tipc_port_peernode(port); + dnode = tsk_peer_node(port); while (sock->state != SS_DISCONNECTING) { buf = __skb_dequeue(&sk->sk_receive_queue); if (buf == NULL) @@ -398,7 +439,7 @@ static int tipc_release(struct socket *sock) if (port->connected) { buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode, tipc_own_addr, - tipc_port_peerport(port), + tsk_peer_port(port), port->ref, TIPC_ERR_NO_PORT); if (buf) tipc_link_xmit(buf, dnode, port->ref); @@ -502,8 +543,8 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, if ((sock->state != SS_CONNECTED) && ((peer != 2) || (sock->state != SS_DISCONNECTING))) return -ENOTCONN; - addr->addr.id.ref = tipc_port_peerport(&tsk->port); - addr->addr.id.node = tipc_port_peernode(&tsk->port); + addr->addr.id.ref = tsk_peer_port(&tsk->port); + addr->addr.id.node = tsk_peer_node(&tsk->port); } else { addr->addr.id.ref = tsk->port.ref; addr->addr.id.node = tipc_own_addr; @@ -699,7 +740,7 @@ static int tipc_sk_proto_rcv(struct tipc_sock *tsk, u32 *dnode, int conn_cong; /* Ignore if connection cannot be validated: */ - if (!tipc_sk_peer_msg(tsk, msg)) + if (!tsk_peer_msg(tsk, msg)) goto exit; port->probing_state = TIPC_CONN_OK; @@ -986,7 +1027,7 @@ static int tipc_send_stream(struct kiocb *iocb, struct socket *sock, } timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); - dnode = tipc_port_peernode(port); + dnode = tsk_peer_node(port); next: mtu = port->max_pkt; @@ -1161,8 +1202,8 @@ static void tipc_sk_send_ack(struct tipc_port *port, uint ack) { struct sk_buff *buf = NULL; struct tipc_msg *msg; - u32 peer_port = tipc_port_peerport(port); - u32 dnode = tipc_port_peernode(port); + u32 peer_port = tsk_peer_port(port); + u32 dnode = tsk_peer_node(port); if (!port->connected) return; @@ -1260,7 +1301,7 @@ restart: /* Discard an empty non-errored message & try again */ if ((!sz) && (!err)) { - advance_rx_queue(sk); + tsk_advance_rx_queue(sk); goto restart; } @@ -1298,7 +1339,7 @@ restart: tipc_sk_send_ack(port, tsk->rcv_unacked); tsk->rcv_unacked = 0; } - advance_rx_queue(sk); + tsk_advance_rx_queue(sk); } exit: release_sock(sk); @@ -1360,7 +1401,7 @@ restart: /* Discard an empty non-errored message & try again */ if ((!sz) && (!err)) { - advance_rx_queue(sk); + tsk_advance_rx_queue(sk); goto restart; } @@ -1409,7 +1450,7 @@ restart: tipc_sk_send_ack(port, tsk->rcv_unacked); tsk->rcv_unacked = 0; } - advance_rx_queue(sk); + tsk_advance_rx_queue(sk); } /* Loop around if more data is required */ @@ -1480,12 +1521,12 @@ static int filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) switch ((int)sock->state) { case SS_CONNECTED: /* Accept only connection-based messages sent by peer */ - if (tipc_sk_peer_msg(tsk, msg)) { + if (tsk_peer_msg(tsk, msg)) { if (unlikely(msg_errcode(msg))) { sock->state = SS_DISCONNECTING; port->connected = 0; /* let timer expire on it's own */ - tipc_node_remove_conn(tipc_port_peernode(port), + tipc_node_remove_conn(tsk_peer_node(port), port->ref); } retval = TIPC_OK; @@ -1919,13 +1960,13 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) * Reject any stray messages received by new socket * before the socket lock was taken (very, very unlikely) */ - reject_rx_queue(new_sk); + tsk_rej_rx_queue(new_sk); /* Connect new socket to it's peer */ tipc_sk_finish_conn(new_port, msg_origport(msg), msg_orignode(msg)); new_sock->state = SS_CONNECTED; - tipc_port_set_importance(new_port, msg_importance(msg)); + tsk_set_importance(new_port, msg_importance(msg)); if (msg_named(msg)) { new_port->conn_type = msg_nametype(msg); new_port->conn_instance = msg_nameinst(msg); @@ -1938,7 +1979,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) if (!msg_data_sz(msg)) { struct msghdr m = {NULL,}; - advance_rx_queue(sk); + tsk_advance_rx_queue(sk); tipc_send_packet(NULL, new_sock, &m, 0); } else { __skb_dequeue(&sk->sk_receive_queue); @@ -1990,11 +2031,11 @@ restart: tipc_link_xmit(buf, dnode, port->ref); tipc_node_remove_conn(dnode, port->ref); } else { - dnode = tipc_port_peernode(port); + dnode = tsk_peer_node(port); buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode, tipc_own_addr, - tipc_port_peerport(port), + tsk_peer_port(port), port->ref, TIPC_CONN_SHUTDOWN); tipc_link_xmit(buf, dnode, port->ref); } @@ -2040,8 +2081,8 @@ static void tipc_sk_timeout(unsigned long ref) bh_unlock_sock(sk); goto exit; } - peer_port = tipc_port_peerport(port); - peer_node = tipc_port_peernode(port); + peer_port = tsk_peer_port(port); + peer_node = tsk_peer_node(port); if (port->probing_state == TIPC_CONN_PROBING) { /* Previous probe not answered -> self abort */ @@ -2132,8 +2173,8 @@ static int tipc_sk_show(struct tipc_port *port, char *buf, ret = tipc_snprintf(buf, len, "%-10u:", port->ref); if (port->connected) { - u32 dport = tipc_port_peerport(port); - u32 destnode = tipc_port_peernode(port); + u32 dport = tsk_peer_port(port); + u32 destnode = tsk_peer_node(port); ret += tipc_snprintf(buf + ret, len - ret, " connected to <%u.%u.%u:%u>", @@ -2248,16 +2289,16 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - res = tipc_port_set_importance(port, value); + res = tsk_set_importance(port, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) - tipc_port_set_unreliable(port, value); + tsk_set_unreliable(port, value); else res = -ENOPROTOOPT; break; case TIPC_DEST_DROPPABLE: - tipc_port_set_unreturnable(port, value); + tsk_set_unreturnable(port, value); break; case TIPC_CONN_TIMEOUT: tipc_sk(sk)->conn_timeout = value; @@ -2307,13 +2348,13 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - value = tipc_port_importance(port); + value = tsk_importance(port); break; case TIPC_SRC_DROPPABLE: - value = tipc_port_unreliable(port); + value = tsk_unreliable(port); break; case TIPC_DEST_DROPPABLE: - value = tipc_port_unreturnable(port); + value = tsk_unreturnable(port); break; case TIPC_CONN_TIMEOUT: value = tipc_sk(sk)->conn_timeout; diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 5d515be604a9..b98725e27b94 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -35,11 +35,48 @@ #ifndef _TIPC_SOCK_H #define _TIPC_SOCK_H -#include "port.h" #include +#include "msg.h" #define TIPC_CONN_OK 0 #define TIPC_CONN_PROBING 1 +#define TIPC_CONNACK_INTV 256 +#define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2) +#define TIPC_CONN_OVERLOAD_LIMIT ((TIPC_FLOWCTRL_WIN * 2 + 1) * \ + SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) + +/** + * struct tipc_port - TIPC port structure + * @lock: pointer to spinlock for controlling access to port + * @connected: non-zero if port is currently connected to a peer port + * @conn_type: TIPC type used when connection was established + * @conn_instance: TIPC instance used when connection was established + * @published: non-zero if port has one or more associated names + * @max_pkt: maximum packet size "hint" used when building messages sent by port + * @ref: unique reference to port in TIPC object registry + * @phdr: preformatted message header used when sending messages + * @port_list: adjacent ports in TIPC's global list of ports + * @publications: list of publications for port + * @pub_count: total # of publications port has made during its lifetime + * @probing_state: + * @probing_interval: + * @timer_ref: + */ +struct tipc_port { + int connected; + u32 conn_type; + u32 conn_instance; + int published; + u32 max_pkt; + u32 ref; + struct tipc_msg phdr; + struct list_head port_list; + struct list_head publications; + u32 pub_count; + u32 probing_state; + u32 probing_interval; + struct timer_list timer; +}; /** * struct tipc_sock - TIPC socket structure diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 642437231ad5..31b5cb232a43 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -36,7 +36,6 @@ #include "core.h" #include "name_table.h" -#include "port.h" #include "subscr.h" /** -- cgit v1.2.3 From 808d90f9c55943c2965d33b7156e559c59dd2db9 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:19 -0400 Subject: tipc: remove files ref.h and ref.c The reference table is now 'socket aware' instead of being generic, and has in reality become a socket internal table. In order to be able to minimize the API exposed by the socket layer towards the rest of the stack, we now move the reference table definitions and functions into the file socket.c, and rename the functions accordingly. There are no functional changes in this commit. Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/Makefile | 2 +- net/tipc/core.c | 7 +- net/tipc/ref.c | 270 ------------------------------------------------------ net/tipc/ref.h | 56 ----------- net/tipc/socket.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++- net/tipc/socket.h | 2 + 6 files changed, 250 insertions(+), 334 deletions(-) delete mode 100644 net/tipc/ref.c delete mode 100644 net/tipc/ref.h (limited to 'net') diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 5206a4440222..b8a13caad59a 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_TIPC) := tipc.o tipc-y += addr.o bcast.o bearer.o config.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o name_table.o net.o \ - netlink.o node.o node_subscr.o ref.o \ + netlink.o node.o node_subscr.o \ socket.o log.o eth_media.o server.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o diff --git a/net/tipc/core.c b/net/tipc/core.c index b3b03ef30df5..a5737b8407dd 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -35,7 +35,6 @@ */ #include "core.h" -#include "ref.h" #include "name_table.h" #include "subscr.h" #include "config.h" @@ -85,7 +84,7 @@ static void tipc_core_stop(void) tipc_netlink_stop(); tipc_subscr_stop(); tipc_nametbl_stop(); - tipc_ref_table_stop(); + tipc_sk_ref_table_stop(); tipc_socket_stop(); tipc_unregister_sysctl(); } @@ -99,7 +98,7 @@ static int tipc_core_start(void) get_random_bytes(&tipc_random, sizeof(tipc_random)); - err = tipc_ref_table_init(tipc_max_ports, tipc_random); + err = tipc_sk_ref_table_init(tipc_max_ports, tipc_random); if (err) goto out_reftbl; @@ -139,7 +138,7 @@ out_socket: out_netlink: tipc_nametbl_stop(); out_nametbl: - tipc_ref_table_stop(); + tipc_sk_ref_table_stop(); out_reftbl: return err; } diff --git a/net/tipc/ref.c b/net/tipc/ref.c deleted file mode 100644 index ea981bed967b..000000000000 --- a/net/tipc/ref.c +++ /dev/null @@ -1,270 +0,0 @@ -/* - * net/tipc/ref.c: TIPC socket registry code - * - * Copyright (c) 1991-2006, 2014, Ericsson AB - * Copyright (c) 2004-2007, Wind River Systems - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "core.h" -#include "ref.h" - -/** - * struct reference - TIPC socket reference entry - * @tsk: pointer to socket associated with reference entry - * @ref: reference value for socket (combines instance & array index info) - */ -struct reference { - struct tipc_sock *tsk; - u32 ref; -}; - -/** - * struct tipc_ref_table - table of TIPC socket reference entries - * @entries: pointer to array of reference entries - * @capacity: array index of first unusable entry - * @init_point: array index of first uninitialized entry - * @first_free: array index of first unused socket reference entry - * @last_free: array index of last unused socket reference entry - * @index_mask: bitmask for array index portion of reference values - * @start_mask: initial value for instance value portion of reference values - */ -struct ref_table { - struct reference *entries; - u32 capacity; - u32 init_point; - u32 first_free; - u32 last_free; - u32 index_mask; - u32 start_mask; -}; - -/* - * Socket reference table consists of 2**N entries. - * - * State Socket ptr Reference - * ----- ---------- --------- - * In use non-NULL XXXX|own index - * (XXXX changes each time entry is acquired) - * Free NULL YYYY|next free index - * (YYYY is one more than last used XXXX) - * Uninitialized NULL 0 - * - * Entry 0 is not used; this allows index 0 to denote the end of the free list. - * - * Note that a reference value of 0 does not necessarily indicate that an - * entry is uninitialized, since the last entry in the free list could also - * have a reference value of 0 (although this is unlikely). - */ - -static struct ref_table tipc_ref_table; - -static DEFINE_RWLOCK(ref_table_lock); - -/** - * tipc_ref_table_init - create reference table for sockets - */ -int tipc_ref_table_init(u32 requested_size, u32 start) -{ - struct reference *table; - u32 actual_size; - - /* account for unused entry, then round up size to a power of 2 */ - - requested_size++; - for (actual_size = 16; actual_size < requested_size; actual_size <<= 1) - /* do nothing */ ; - - /* allocate table & mark all entries as uninitialized */ - table = vzalloc(actual_size * sizeof(struct reference)); - if (table == NULL) - return -ENOMEM; - - tipc_ref_table.entries = table; - tipc_ref_table.capacity = requested_size; - tipc_ref_table.init_point = 1; - tipc_ref_table.first_free = 0; - tipc_ref_table.last_free = 0; - tipc_ref_table.index_mask = actual_size - 1; - tipc_ref_table.start_mask = start & ~tipc_ref_table.index_mask; - - return 0; -} - -/** - * tipc_ref_table_stop - destroy reference table for sockets - */ -void tipc_ref_table_stop(void) -{ - if (!tipc_ref_table.entries) - return; - vfree(tipc_ref_table.entries); - tipc_ref_table.entries = NULL; -} - -/* tipc_ref_acquire - create reference to a socket - * - * Register an socket pointer in the reference table. - * Returns a unique reference value that is used from then on to retrieve the - * socket pointer, or to determine if the socket has been deregistered. - */ -u32 tipc_ref_acquire(struct tipc_sock *tsk) -{ - u32 index; - u32 index_mask; - u32 next_plus_upper; - u32 ref = 0; - struct reference *entry; - - if (unlikely(!tsk)) { - pr_err("Attempt to acquire ref. to non-existent obj\n"); - return 0; - } - if (unlikely(!tipc_ref_table.entries)) { - pr_err("Ref. table not found in acquisition attempt\n"); - return 0; - } - - /* Take a free entry, if available; otherwise initialize a new one */ - write_lock_bh(&ref_table_lock); - index = tipc_ref_table.first_free; - entry = &tipc_ref_table.entries[index]; - - if (likely(index)) { - index = tipc_ref_table.first_free; - entry = &(tipc_ref_table.entries[index]); - index_mask = tipc_ref_table.index_mask; - next_plus_upper = entry->ref; - tipc_ref_table.first_free = next_plus_upper & index_mask; - ref = (next_plus_upper & ~index_mask) + index; - entry->tsk = tsk; - } else if (tipc_ref_table.init_point < tipc_ref_table.capacity) { - index = tipc_ref_table.init_point++; - entry = &(tipc_ref_table.entries[index]); - ref = tipc_ref_table.start_mask + index; - } - - if (ref) { - entry->ref = ref; - entry->tsk = tsk; - } - write_unlock_bh(&ref_table_lock); - return ref; -} - -/* tipc_ref_discard - invalidate reference to an socket - * - * Disallow future references to an socket and free up the entry for re-use. - */ -void tipc_ref_discard(u32 ref) -{ - struct reference *entry; - u32 index; - u32 index_mask; - - if (unlikely(!tipc_ref_table.entries)) { - pr_err("Ref. table not found during discard attempt\n"); - return; - } - - index_mask = tipc_ref_table.index_mask; - index = ref & index_mask; - entry = &(tipc_ref_table.entries[index]); - - write_lock_bh(&ref_table_lock); - - if (unlikely(!entry->tsk)) { - pr_err("Attempt to discard ref. to non-existent socket\n"); - goto exit; - } - if (unlikely(entry->ref != ref)) { - pr_err("Attempt to discard non-existent reference\n"); - goto exit; - } - - /* - * Mark entry as unused; increment instance part of entry's reference - * to invalidate any subsequent references - */ - entry->tsk = NULL; - entry->ref = (ref & ~index_mask) + (index_mask + 1); - - /* Append entry to free entry list */ - if (unlikely(tipc_ref_table.first_free == 0)) - tipc_ref_table.first_free = index; - else - tipc_ref_table.entries[tipc_ref_table.last_free].ref |= index; - tipc_ref_table.last_free = index; -exit: - write_unlock_bh(&ref_table_lock); -} - -/* tipc_sk_get - find referenced socket and return pointer to it - */ -struct tipc_sock *tipc_sk_get(u32 ref) -{ - struct reference *entry; - struct tipc_sock *tsk; - - if (unlikely(!tipc_ref_table.entries)) - return NULL; - read_lock_bh(&ref_table_lock); - entry = &tipc_ref_table.entries[ref & tipc_ref_table.index_mask]; - tsk = entry->tsk; - if (likely(tsk && (entry->ref == ref))) - sock_hold(&tsk->sk); - else - tsk = NULL; - read_unlock_bh(&ref_table_lock); - return tsk; -} - -/* tipc_sk_get_next - lock & return next socket after referenced one -*/ -struct tipc_sock *tipc_sk_get_next(u32 *ref) -{ - struct reference *entry; - struct tipc_sock *tsk = NULL; - uint index = *ref & tipc_ref_table.index_mask; - - read_lock_bh(&ref_table_lock); - while (++index < tipc_ref_table.capacity) { - entry = &tipc_ref_table.entries[index]; - if (!entry->tsk) - continue; - tsk = entry->tsk; - sock_hold(&tsk->sk); - *ref = entry->ref; - break; - } - read_unlock_bh(&ref_table_lock); - return tsk; -} diff --git a/net/tipc/ref.h b/net/tipc/ref.h deleted file mode 100644 index 2b75a892305a..000000000000 --- a/net/tipc/ref.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * net/tipc/ref.h: Include file for TIPC object registry code - * - * Copyright (c) 1991-2006, 2014, Ericsson AB - * Copyright (c) 2005-2006, Wind River Systems - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _TIPC_REF_H -#define _TIPC_REF_H - -#include "socket.h" - -int tipc_ref_table_init(u32 requested_size, u32 start); -void tipc_ref_table_stop(void); - -u32 tipc_ref_acquire(struct tipc_sock *tsk); -void tipc_ref_discard(u32 ref); - -struct tipc_sock *tipc_sk_get(u32 ref); -struct tipc_sock *tipc_sk_get_next(u32 *ref); - -static inline void tipc_sk_put(struct tipc_sock *tsk) -{ - sock_put(&tsk->sk); -} - -#endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 70eaceae1f8c..6a699671245b 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -35,7 +35,6 @@ */ #include "core.h" -#include "ref.h" #include "name_table.h" #include "node.h" #include "link.h" @@ -61,6 +60,11 @@ static int tipc_sk_publish(struct tipc_port *port, uint scope, struct tipc_name_seq const *seq); static int tipc_sk_withdraw(struct tipc_port *port, uint scope, struct tipc_name_seq const *seq); +static u32 tipc_sk_ref_acquire(struct tipc_sock *tsk); +static void tipc_sk_ref_discard(u32 ref); +static struct tipc_sock *tipc_sk_get(u32 ref); +static struct tipc_sock *tipc_sk_get_next(u32 *ref); +static void tipc_sk_put(struct tipc_sock *tsk); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -271,7 +275,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); port = &tsk->port; - ref = tipc_ref_acquire(tsk); + ref = tipc_sk_ref_acquire(tsk); if (!ref) { pr_warn("Socket create failed; reference table exhausted\n"); return -ENOMEM; @@ -434,7 +438,7 @@ static int tipc_release(struct socket *sock) } tipc_sk_withdraw(port, 0, NULL); - tipc_ref_discard(port->ref); + tipc_sk_ref_discard(port->ref); k_cancel_timer(&port->timer); if (port->connected) { buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, @@ -2253,6 +2257,243 @@ void tipc_sk_reinit(void) } } +/** + * struct reference - TIPC socket reference entry + * @tsk: pointer to socket associated with reference entry + * @ref: reference value for socket (combines instance & array index info) + */ +struct reference { + struct tipc_sock *tsk; + u32 ref; +}; + +/** + * struct tipc_ref_table - table of TIPC socket reference entries + * @entries: pointer to array of reference entries + * @capacity: array index of first unusable entry + * @init_point: array index of first uninitialized entry + * @first_free: array index of first unused socket reference entry + * @last_free: array index of last unused socket reference entry + * @index_mask: bitmask for array index portion of reference values + * @start_mask: initial value for instance value portion of reference values + */ +struct ref_table { + struct reference *entries; + u32 capacity; + u32 init_point; + u32 first_free; + u32 last_free; + u32 index_mask; + u32 start_mask; +}; + +/* Socket reference table consists of 2**N entries. + * + * State Socket ptr Reference + * ----- ---------- --------- + * In use non-NULL XXXX|own index + * (XXXX changes each time entry is acquired) + * Free NULL YYYY|next free index + * (YYYY is one more than last used XXXX) + * Uninitialized NULL 0 + * + * Entry 0 is not used; this allows index 0 to denote the end of the free list. + * + * Note that a reference value of 0 does not necessarily indicate that an + * entry is uninitialized, since the last entry in the free list could also + * have a reference value of 0 (although this is unlikely). + */ + +static struct ref_table tipc_ref_table; + +static DEFINE_RWLOCK(ref_table_lock); + +/** + * tipc_ref_table_init - create reference table for sockets + */ +int tipc_sk_ref_table_init(u32 req_sz, u32 start) +{ + struct reference *table; + u32 actual_sz; + + /* account for unused entry, then round up size to a power of 2 */ + + req_sz++; + for (actual_sz = 16; actual_sz < req_sz; actual_sz <<= 1) { + /* do nothing */ + }; + + /* allocate table & mark all entries as uninitialized */ + table = vzalloc(actual_sz * sizeof(struct reference)); + if (table == NULL) + return -ENOMEM; + + tipc_ref_table.entries = table; + tipc_ref_table.capacity = req_sz; + tipc_ref_table.init_point = 1; + tipc_ref_table.first_free = 0; + tipc_ref_table.last_free = 0; + tipc_ref_table.index_mask = actual_sz - 1; + tipc_ref_table.start_mask = start & ~tipc_ref_table.index_mask; + + return 0; +} + +/** + * tipc_ref_table_stop - destroy reference table for sockets + */ +void tipc_sk_ref_table_stop(void) +{ + if (!tipc_ref_table.entries) + return; + vfree(tipc_ref_table.entries); + tipc_ref_table.entries = NULL; +} + +/* tipc_ref_acquire - create reference to a socket + * + * Register an socket pointer in the reference table. + * Returns a unique reference value that is used from then on to retrieve the + * socket pointer, or to determine if the socket has been deregistered. + */ +u32 tipc_sk_ref_acquire(struct tipc_sock *tsk) +{ + u32 index; + u32 index_mask; + u32 next_plus_upper; + u32 ref = 0; + struct reference *entry; + + if (unlikely(!tsk)) { + pr_err("Attempt to acquire ref. to non-existent obj\n"); + return 0; + } + if (unlikely(!tipc_ref_table.entries)) { + pr_err("Ref. table not found in acquisition attempt\n"); + return 0; + } + + /* Take a free entry, if available; otherwise initialize a new one */ + write_lock_bh(&ref_table_lock); + index = tipc_ref_table.first_free; + entry = &tipc_ref_table.entries[index]; + + if (likely(index)) { + index = tipc_ref_table.first_free; + entry = &tipc_ref_table.entries[index]; + index_mask = tipc_ref_table.index_mask; + next_plus_upper = entry->ref; + tipc_ref_table.first_free = next_plus_upper & index_mask; + ref = (next_plus_upper & ~index_mask) + index; + entry->tsk = tsk; + } else if (tipc_ref_table.init_point < tipc_ref_table.capacity) { + index = tipc_ref_table.init_point++; + entry = &tipc_ref_table.entries[index]; + ref = tipc_ref_table.start_mask + index; + } + + if (ref) { + entry->ref = ref; + entry->tsk = tsk; + } + write_unlock_bh(&ref_table_lock); + return ref; +} + +/* tipc_sk_ref_discard - invalidate reference to an socket + * + * Disallow future references to an socket and free up the entry for re-use. + */ +void tipc_sk_ref_discard(u32 ref) +{ + struct reference *entry; + u32 index; + u32 index_mask; + + if (unlikely(!tipc_ref_table.entries)) { + pr_err("Ref. table not found during discard attempt\n"); + return; + } + + index_mask = tipc_ref_table.index_mask; + index = ref & index_mask; + entry = &tipc_ref_table.entries[index]; + + write_lock_bh(&ref_table_lock); + + if (unlikely(!entry->tsk)) { + pr_err("Attempt to discard ref. to non-existent socket\n"); + goto exit; + } + if (unlikely(entry->ref != ref)) { + pr_err("Attempt to discard non-existent reference\n"); + goto exit; + } + + /* Mark entry as unused; increment instance part of entry's + * reference to invalidate any subsequent references + */ + + entry->tsk = NULL; + entry->ref = (ref & ~index_mask) + (index_mask + 1); + + /* Append entry to free entry list */ + if (unlikely(tipc_ref_table.first_free == 0)) + tipc_ref_table.first_free = index; + else + tipc_ref_table.entries[tipc_ref_table.last_free].ref |= index; + tipc_ref_table.last_free = index; +exit: + write_unlock_bh(&ref_table_lock); +} + +/* tipc_sk_get - find referenced socket and return pointer to it + */ +struct tipc_sock *tipc_sk_get(u32 ref) +{ + struct reference *entry; + struct tipc_sock *tsk; + + if (unlikely(!tipc_ref_table.entries)) + return NULL; + read_lock_bh(&ref_table_lock); + entry = &tipc_ref_table.entries[ref & tipc_ref_table.index_mask]; + tsk = entry->tsk; + if (likely(tsk && (entry->ref == ref))) + sock_hold(&tsk->sk); + else + tsk = NULL; + read_unlock_bh(&ref_table_lock); + return tsk; +} + +/* tipc_sk_get_next - lock & return next socket after referenced one +*/ +struct tipc_sock *tipc_sk_get_next(u32 *ref) +{ + struct reference *entry; + struct tipc_sock *tsk = NULL; + uint index = *ref & tipc_ref_table.index_mask; + + read_lock_bh(&ref_table_lock); + while (++index < tipc_ref_table.capacity) { + entry = &tipc_ref_table.entries[index]; + if (!entry->tsk) + continue; + tsk = entry->tsk; + sock_hold(&tsk->sk); + *ref = entry->ref; + break; + } + read_unlock_bh(&ref_table_lock); + return tsk; +} + +static void tipc_sk_put(struct tipc_sock *tsk) +{ + sock_put(&tsk->sk); +} + /** * tipc_setsockopt - set socket option * @sock: socket structure diff --git a/net/tipc/socket.h b/net/tipc/socket.h index b98725e27b94..48772169bc77 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -119,5 +119,7 @@ int tipc_sk_rcv(struct sk_buff *buf); struct sk_buff *tipc_sk_socks_show(void); void tipc_sk_mcast_rcv(struct sk_buff *buf); void tipc_sk_reinit(void); +int tipc_sk_ref_table_init(u32 requested_size, u32 start); +void tipc_sk_ref_table_stop(void); #endif -- cgit v1.2.3 From 301bae56f21295a4ba71367818d80735687f11ac Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 22 Aug 2014 18:09:20 -0400 Subject: tipc: merge struct tipc_port into struct tipc_sock We complete the merging of the port and socket layer by aggregating the fields of struct tipc_port directly into struct tipc_sock, and moving the combined structure into socket.c. We also move all functions and macros that are not any longer exposed to the rest of the stack into socket.c, and rename them accordingly. Despite the size of this commit, there are no functional changes. We have only made such changes that are necessary due of the removal of struct tipc_port. Signed-off-by: Jon Maloy Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/socket.c | 373 ++++++++++++++++++++++++++++++------------------------ net/tipc/socket.h | 74 ----------- 2 files changed, 206 insertions(+), 241 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 6a699671245b..d416e83ce069 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -45,9 +45,57 @@ #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ -#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ +#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTERVAL 3600000 /* [ms] => 1 h */ -#define TIPC_FWD_MSG 1 +#define TIPC_FWD_MSG 1 +#define TIPC_CONN_OK 0 +#define TIPC_CONN_PROBING 1 + +/** + * struct tipc_sock - TIPC socket structure + * @sk: socket - interacts with 'port' and with user via the socket API + * @connected: non-zero if port is currently connected to a peer port + * @conn_type: TIPC type used when connection was established + * @conn_instance: TIPC instance used when connection was established + * @published: non-zero if port has one or more associated names + * @max_pkt: maximum packet size "hint" used when building messages sent by port + * @ref: unique reference to port in TIPC object registry + * @phdr: preformatted message header used when sending messages + * @port_list: adjacent ports in TIPC's global list of ports + * @publications: list of publications for port + * @pub_count: total # of publications port has made during its lifetime + * @probing_state: + * @probing_interval: + * @timer: + * @port: port - interacts with 'sk' and with the rest of the TIPC stack + * @peer_name: the peer of the connection, if any + * @conn_timeout: the time we can wait for an unresponded setup request + * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue + * @link_cong: non-zero if owner must sleep because of link congestion + * @sent_unacked: # messages sent by socket, and not yet acked by peer + * @rcv_unacked: # messages read by user, but not yet acked back to peer + */ +struct tipc_sock { + struct sock sk; + int connected; + u32 conn_type; + u32 conn_instance; + int published; + u32 max_pkt; + u32 ref; + struct tipc_msg phdr; + struct list_head sock_list; + struct list_head publications; + u32 pub_count; + u32 probing_state; + u32 probing_interval; + struct timer_list timer; + uint conn_timeout; + atomic_t dupl_rcvcnt; + bool link_cong; + uint sent_unacked; + uint rcv_unacked; +}; static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); static void tipc_data_ready(struct sock *sk); @@ -56,9 +104,9 @@ static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); static void tipc_sk_timeout(unsigned long ref); -static int tipc_sk_publish(struct tipc_port *port, uint scope, +static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); -static int tipc_sk_withdraw(struct tipc_port *port, uint scope, +static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); static u32 tipc_sk_ref_acquire(struct tipc_sock *tsk); static void tipc_sk_ref_discard(u32 ref); @@ -118,49 +166,59 @@ static struct proto tipc_proto_kern; * - port reference */ -static u32 tsk_peer_node(struct tipc_port *p_ptr) +static u32 tsk_peer_node(struct tipc_sock *tsk) { - return msg_destnode(&p_ptr->phdr); + return msg_destnode(&tsk->phdr); } -static u32 tsk_peer_port(struct tipc_port *p_ptr) +static u32 tsk_peer_port(struct tipc_sock *tsk) { - return msg_destport(&p_ptr->phdr); + return msg_destport(&tsk->phdr); } -static bool tsk_unreliable(struct tipc_port *port) +static bool tsk_unreliable(struct tipc_sock *tsk) { - return msg_src_droppable(&port->phdr) != 0; + return msg_src_droppable(&tsk->phdr) != 0; } -static void tsk_set_unreliable(struct tipc_port *port, bool unreliable) +static void tsk_set_unreliable(struct tipc_sock *tsk, bool unreliable) { - msg_set_src_droppable(&port->phdr, unreliable ? 1 : 0); + msg_set_src_droppable(&tsk->phdr, unreliable ? 1 : 0); } -static bool tsk_unreturnable(struct tipc_port *port) +static bool tsk_unreturnable(struct tipc_sock *tsk) { - return msg_dest_droppable(&port->phdr) != 0; + return msg_dest_droppable(&tsk->phdr) != 0; } -static void tsk_set_unreturnable(struct tipc_port *port, bool unreturnable) +static void tsk_set_unreturnable(struct tipc_sock *tsk, bool unreturnable) { - msg_set_dest_droppable(&port->phdr, unreturnable ? 1 : 0); + msg_set_dest_droppable(&tsk->phdr, unreturnable ? 1 : 0); } -static int tsk_importance(struct tipc_port *port) +static int tsk_importance(struct tipc_sock *tsk) { - return msg_importance(&port->phdr); + return msg_importance(&tsk->phdr); } -static int tsk_set_importance(struct tipc_port *port, int imp) +static int tsk_set_importance(struct tipc_sock *tsk, int imp) { if (imp > TIPC_CRITICAL_IMPORTANCE) return -EINVAL; - msg_set_importance(&port->phdr, (u32)imp); + msg_set_importance(&tsk->phdr, (u32)imp); return 0; } +static struct tipc_sock *tipc_sk(const struct sock *sk) +{ + return container_of(sk, struct tipc_sock, sk); +} + +static int tsk_conn_cong(struct tipc_sock *tsk) +{ + return tsk->sent_unacked >= TIPC_FLOWCTRL_WIN; +} + /** * tsk_advance_rx_queue - discard first buffer in socket receive queue * @@ -194,18 +252,18 @@ static void tsk_rej_rx_queue(struct sock *sk) */ static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) { - u32 peer_port = tsk_peer_port(&tsk->port); + u32 peer_port = tsk_peer_port(tsk); u32 orig_node; u32 peer_node; - if (unlikely(!tsk->port.connected)) + if (unlikely(!tsk->connected)) return false; if (unlikely(msg_origport(msg) != peer_port)) return false; orig_node = msg_orignode(msg); - peer_node = tsk_peer_node(&tsk->port); + peer_node = tsk_peer_node(tsk); if (likely(orig_node == peer_node)) return true; @@ -238,7 +296,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock, socket_state state; struct sock *sk; struct tipc_sock *tsk; - struct tipc_port *port; struct tipc_msg *msg; u32 ref; @@ -274,17 +331,15 @@ static int tipc_sk_create(struct net *net, struct socket *sock, return -ENOMEM; tsk = tipc_sk(sk); - port = &tsk->port; ref = tipc_sk_ref_acquire(tsk); if (!ref) { pr_warn("Socket create failed; reference table exhausted\n"); return -ENOMEM; } - port->max_pkt = MAX_PKT_DEFAULT; - port->ref = ref; - INIT_LIST_HEAD(&port->publications); - - msg = &port->phdr; + tsk->max_pkt = MAX_PKT_DEFAULT; + tsk->ref = ref; + INIT_LIST_HEAD(&tsk->publications); + msg = &tsk->phdr; tipc_msg_init(msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, NAMED_H_SIZE, 0); msg_set_origport(msg, ref); @@ -293,7 +348,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sock->ops = ops; sock->state = state; sock_init_data(sock, sk); - k_init_timer(&port->timer, (Handler)tipc_sk_timeout, ref); + k_init_timer(&tsk->timer, (Handler)tipc_sk_timeout, ref); sk->sk_backlog_rcv = tipc_backlog_rcv; sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; @@ -303,9 +358,9 @@ static int tipc_sk_create(struct net *net, struct socket *sock, atomic_set(&tsk->dupl_rcvcnt, 0); if (sock->state == SS_READY) { - tsk_set_unreturnable(port, true); + tsk_set_unreturnable(tsk, true); if (sock->type == SOCK_DGRAM) - tsk_set_unreliable(port, true); + tsk_set_unreliable(tsk, true); } return 0; } @@ -399,7 +454,6 @@ static int tipc_release(struct socket *sock) { struct sock *sk = sock->sk; struct tipc_sock *tsk; - struct tipc_port *port; struct sk_buff *buf; u32 dnode; @@ -411,14 +465,13 @@ static int tipc_release(struct socket *sock) return 0; tsk = tipc_sk(sk); - port = &tsk->port; lock_sock(sk); /* * Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer) */ - dnode = tsk_peer_node(port); + dnode = tsk_peer_node(tsk); while (sock->state != SS_DISCONNECTING) { buf = __skb_dequeue(&sk->sk_receive_queue); if (buf == NULL) @@ -429,27 +482,27 @@ static int tipc_release(struct socket *sock) if ((sock->state == SS_CONNECTING) || (sock->state == SS_CONNECTED)) { sock->state = SS_DISCONNECTING; - port->connected = 0; - tipc_node_remove_conn(dnode, port->ref); + tsk->connected = 0; + tipc_node_remove_conn(dnode, tsk->ref); } if (tipc_msg_reverse(buf, &dnode, TIPC_ERR_NO_PORT)) tipc_link_xmit(buf, dnode, 0); } } - tipc_sk_withdraw(port, 0, NULL); - tipc_sk_ref_discard(port->ref); - k_cancel_timer(&port->timer); - if (port->connected) { + tipc_sk_withdraw(tsk, 0, NULL); + tipc_sk_ref_discard(tsk->ref); + k_cancel_timer(&tsk->timer); + if (tsk->connected) { buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode, tipc_own_addr, - tsk_peer_port(port), - port->ref, TIPC_ERR_NO_PORT); + tsk_peer_port(tsk), + tsk->ref, TIPC_ERR_NO_PORT); if (buf) - tipc_link_xmit(buf, dnode, port->ref); - tipc_node_remove_conn(dnode, port->ref); + tipc_link_xmit(buf, dnode, tsk->ref); + tipc_node_remove_conn(dnode, tsk->ref); } - k_term_timer(&port->timer); + k_term_timer(&tsk->timer); /* Discard any remaining (connection-based) messages in receive queue */ __skb_queue_purge(&sk->sk_receive_queue); @@ -488,7 +541,7 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, lock_sock(sk); if (unlikely(!uaddr_len)) { - res = tipc_sk_withdraw(&tsk->port, 0, NULL); + res = tipc_sk_withdraw(tsk, 0, NULL); goto exit; } @@ -516,8 +569,8 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, } res = (addr->scope > 0) ? - tipc_sk_publish(&tsk->port, addr->scope, &addr->addr.nameseq) : - tipc_sk_withdraw(&tsk->port, -addr->scope, &addr->addr.nameseq); + tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq) : + tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq); exit: release_sock(sk); return res; @@ -547,10 +600,10 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, if ((sock->state != SS_CONNECTED) && ((peer != 2) || (sock->state != SS_DISCONNECTING))) return -ENOTCONN; - addr->addr.id.ref = tsk_peer_port(&tsk->port); - addr->addr.id.node = tsk_peer_node(&tsk->port); + addr->addr.id.ref = tsk_peer_port(tsk); + addr->addr.id.node = tsk_peer_node(tsk); } else { - addr->addr.id.ref = tsk->port.ref; + addr->addr.id.ref = tsk->ref; addr->addr.id.node = tipc_own_addr; } @@ -619,7 +672,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, break; case SS_READY: case SS_CONNECTED: - if (!tsk->link_cong && !tipc_sk_conn_cong(tsk)) + if (!tsk->link_cong && !tsk_conn_cong(tsk)) mask |= POLLOUT; /* fall thru' */ case SS_CONNECTING: @@ -650,7 +703,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct iovec *iov, size_t dsz, long timeo) { struct sock *sk = sock->sk; - struct tipc_msg *mhdr = &tipc_sk(sk)->port.phdr; + struct tipc_msg *mhdr = &tipc_sk(sk)->phdr; struct sk_buff *buf; uint mtu; int rc; @@ -740,17 +793,16 @@ static int tipc_sk_proto_rcv(struct tipc_sock *tsk, u32 *dnode, struct sk_buff *buf) { struct tipc_msg *msg = buf_msg(buf); - struct tipc_port *port = &tsk->port; int conn_cong; /* Ignore if connection cannot be validated: */ if (!tsk_peer_msg(tsk, msg)) goto exit; - port->probing_state = TIPC_CONN_OK; + tsk->probing_state = TIPC_CONN_OK; if (msg_type(msg) == CONN_ACK) { - conn_cong = tipc_sk_conn_cong(tsk); + conn_cong = tsk_conn_cong(tsk); tsk->sent_unacked -= msg_msgcnt(msg); if (conn_cong) tsk->sk.sk_write_space(&tsk->sk); @@ -844,8 +896,7 @@ static int tipc_sendmsg(struct kiocb *iocb, struct socket *sock, DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; - struct tipc_msg *mhdr = &port->phdr; + struct tipc_msg *mhdr = &tsk->phdr; struct iovec *iov = m->msg_iov; u32 dnode, dport; struct sk_buff *buf; @@ -876,13 +927,13 @@ static int tipc_sendmsg(struct kiocb *iocb, struct socket *sock, rc = -EISCONN; goto exit; } - if (tsk->port.published) { + if (tsk->published) { rc = -EOPNOTSUPP; goto exit; } if (dest->addrtype == TIPC_ADDR_NAME) { - tsk->port.conn_type = dest->addr.name.name.type; - tsk->port.conn_instance = dest->addr.name.name.instance; + tsk->conn_type = dest->addr.name.name.type; + tsk->conn_instance = dest->addr.name.name.instance; } } rc = dest_name_check(dest, m); @@ -922,14 +973,14 @@ static int tipc_sendmsg(struct kiocb *iocb, struct socket *sock, } new_mtu: - mtu = tipc_node_get_mtu(dnode, tsk->port.ref); + mtu = tipc_node_get_mtu(dnode, tsk->ref); rc = tipc_msg_build(mhdr, iov, 0, dsz, mtu, &buf); if (rc < 0) goto exit; do { TIPC_SKB_CB(buf)->wakeup_pending = tsk->link_cong; - rc = tipc_link_xmit(buf, dnode, tsk->port.ref); + rc = tipc_link_xmit(buf, dnode, tsk->ref); if (likely(rc >= 0)) { if (sock->state != SS_READY) sock->state = SS_CONNECTING; @@ -975,8 +1026,8 @@ static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); done = sk_wait_event(sk, timeo_p, (!tsk->link_cong && - !tipc_sk_conn_cong(tsk)) || - !tsk->port.connected); + !tsk_conn_cong(tsk)) || + !tsk->connected); finish_wait(sk_sleep(sk), &wait); } while (!done); return 0; @@ -999,11 +1050,10 @@ static int tipc_send_stream(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; - struct tipc_msg *mhdr = &port->phdr; + struct tipc_msg *mhdr = &tsk->phdr; struct sk_buff *buf; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); - u32 ref = port->ref; + u32 ref = tsk->ref; int rc = -EINVAL; long timeo; u32 dnode; @@ -1031,16 +1081,16 @@ static int tipc_send_stream(struct kiocb *iocb, struct socket *sock, } timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); - dnode = tsk_peer_node(port); + dnode = tsk_peer_node(tsk); next: - mtu = port->max_pkt; + mtu = tsk->max_pkt; send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); rc = tipc_msg_build(mhdr, m->msg_iov, sent, send, mtu, &buf); if (unlikely(rc < 0)) goto exit; do { - if (likely(!tipc_sk_conn_cong(tsk))) { + if (likely(!tsk_conn_cong(tsk))) { rc = tipc_link_xmit(buf, dnode, ref); if (likely(!rc)) { tsk->sent_unacked++; @@ -1050,7 +1100,7 @@ next: goto next; } if (rc == -EMSGSIZE) { - port->max_pkt = tipc_node_get_mtu(dnode, ref); + tsk->max_pkt = tipc_node_get_mtu(dnode, ref); goto next; } if (rc != -ELINKCONG) @@ -1089,10 +1139,10 @@ static int tipc_send_packet(struct kiocb *iocb, struct socket *sock, /* tipc_sk_finish_conn - complete the setup of a connection */ -static void tipc_sk_finish_conn(struct tipc_port *port, u32 peer_port, +static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, u32 peer_node) { - struct tipc_msg *msg = &port->phdr; + struct tipc_msg *msg = &tsk->phdr; msg_set_destnode(msg, peer_node); msg_set_destport(msg, peer_port); @@ -1100,12 +1150,12 @@ static void tipc_sk_finish_conn(struct tipc_port *port, u32 peer_port, msg_set_lookup_scope(msg, 0); msg_set_hdr_sz(msg, SHORT_H_SIZE); - port->probing_interval = CONN_PROBING_INTERVAL; - port->probing_state = TIPC_CONN_OK; - port->connected = 1; - k_start_timer(&port->timer, port->probing_interval); - tipc_node_add_conn(peer_node, port->ref, peer_port); - port->max_pkt = tipc_node_get_mtu(peer_node, port->ref); + tsk->probing_interval = CONN_PROBING_INTERVAL; + tsk->probing_state = TIPC_CONN_OK; + tsk->connected = 1; + k_start_timer(&tsk->timer, tsk->probing_interval); + tipc_node_add_conn(peer_node, tsk->ref, peer_port); + tsk->max_pkt = tipc_node_get_mtu(peer_node, tsk->ref); } /** @@ -1132,17 +1182,17 @@ static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg) } /** - * anc_data_recv - optionally capture ancillary data for received message + * tipc_sk_anc_data_recv - optionally capture ancillary data for received message * @m: descriptor for message info * @msg: received message header - * @tport: TIPC port associated with message + * @tsk: TIPC port associated with message * * Note: Ancillary data is not captured if not requested by receiver. * * Returns 0 if successful, otherwise errno */ -static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, - struct tipc_port *tport) +static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, + struct tipc_sock *tsk) { u32 anc_data[3]; u32 err; @@ -1185,10 +1235,10 @@ static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, anc_data[2] = msg_nameupper(msg); break; case TIPC_CONN_MSG: - has_name = (tport->conn_type != 0); - anc_data[0] = tport->conn_type; - anc_data[1] = tport->conn_instance; - anc_data[2] = tport->conn_instance; + has_name = (tsk->conn_type != 0); + anc_data[0] = tsk->conn_type; + anc_data[1] = tsk->conn_instance; + anc_data[2] = tsk->conn_instance; break; default: has_name = 0; @@ -1202,17 +1252,17 @@ static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, return 0; } -static void tipc_sk_send_ack(struct tipc_port *port, uint ack) +static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack) { struct sk_buff *buf = NULL; struct tipc_msg *msg; - u32 peer_port = tsk_peer_port(port); - u32 dnode = tsk_peer_node(port); + u32 peer_port = tsk_peer_port(tsk); + u32 dnode = tsk_peer_node(tsk); - if (!port->connected) + if (!tsk->connected) return; buf = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0, dnode, - tipc_own_addr, peer_port, port->ref, TIPC_OK); + tipc_own_addr, peer_port, tsk->ref, TIPC_OK); if (!buf) return; msg = buf_msg(buf); @@ -1270,7 +1320,6 @@ static int tipc_recvmsg(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; struct sk_buff *buf; struct tipc_msg *msg; long timeo; @@ -1313,7 +1362,7 @@ restart: set_orig_addr(m, msg); /* Capture ancillary data (optional) */ - res = anc_data_recv(m, msg, port); + res = tipc_sk_anc_data_recv(m, msg, tsk); if (res) goto exit; @@ -1340,7 +1389,7 @@ restart: if (likely(!(flags & MSG_PEEK))) { if ((sock->state != SS_READY) && (++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { - tipc_sk_send_ack(port, tsk->rcv_unacked); + tipc_sk_send_ack(tsk, tsk->rcv_unacked); tsk->rcv_unacked = 0; } tsk_advance_rx_queue(sk); @@ -1367,7 +1416,6 @@ static int tipc_recv_stream(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; struct sk_buff *buf; struct tipc_msg *msg; long timeo; @@ -1412,7 +1460,7 @@ restart: /* Optionally capture sender's address & ancillary data of first msg */ if (sz_copied == 0) { set_orig_addr(m, msg); - res = anc_data_recv(m, msg, port); + res = tipc_sk_anc_data_recv(m, msg, tsk); if (res) goto exit; } @@ -1451,7 +1499,7 @@ restart: /* Consume received message (optional) */ if (likely(!(flags & MSG_PEEK))) { if (unlikely(++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { - tipc_sk_send_ack(port, tsk->rcv_unacked); + tipc_sk_send_ack(tsk, tsk->rcv_unacked); tsk->rcv_unacked = 0; } tsk_advance_rx_queue(sk); @@ -1513,10 +1561,8 @@ static void tipc_data_ready(struct sock *sk) static int filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) { struct sock *sk = &tsk->sk; - struct tipc_port *port = &tsk->port; struct socket *sock = sk->sk_socket; struct tipc_msg *msg = buf_msg(*buf); - int retval = -TIPC_ERR_NO_PORT; if (msg_mcast(msg)) @@ -1528,10 +1574,10 @@ static int filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) if (tsk_peer_msg(tsk, msg)) { if (unlikely(msg_errcode(msg))) { sock->state = SS_DISCONNECTING; - port->connected = 0; + tsk->connected = 0; /* let timer expire on it's own */ - tipc_node_remove_conn(tsk_peer_node(port), - port->ref); + tipc_node_remove_conn(tsk_peer_node(tsk), + tsk->ref); } retval = TIPC_OK; } @@ -1556,8 +1602,8 @@ static int filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) break; } - tipc_sk_finish_conn(port, msg_origport(msg), msg_orignode(msg)); - msg_set_importance(&port->phdr, msg_importance(msg)); + tipc_sk_finish_conn(tsk, msg_origport(msg), msg_orignode(msg)); + msg_set_importance(&tsk->phdr, msg_importance(msg)); sock->state = SS_CONNECTED; /* If an incoming message is an 'ACK-', it should be @@ -1715,7 +1761,6 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *buf) int tipc_sk_rcv(struct sk_buff *buf) { struct tipc_sock *tsk; - struct tipc_port *port; struct sock *sk; u32 dport = msg_destport(buf_msg(buf)); int rc = TIPC_OK; @@ -1728,7 +1773,6 @@ int tipc_sk_rcv(struct sk_buff *buf) rc = tipc_msg_eval(buf, &dnode); goto exit; } - port = &tsk->port; sk = &tsk->sk; /* Queue message */ @@ -1931,7 +1975,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) { struct sock *new_sk, *sk = sock->sk; struct sk_buff *buf; - struct tipc_port *new_port; + struct tipc_sock *new_tsock; struct tipc_msg *msg; long timeo; int res; @@ -1954,7 +1998,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) goto exit; new_sk = new_sock->sk; - new_port = &tipc_sk(new_sk)->port; + new_tsock = tipc_sk(new_sk); msg = buf_msg(buf); /* we lock on new_sk; but lockdep sees the lock on sk */ @@ -1967,13 +2011,13 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) tsk_rej_rx_queue(new_sk); /* Connect new socket to it's peer */ - tipc_sk_finish_conn(new_port, msg_origport(msg), msg_orignode(msg)); + tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg)); new_sock->state = SS_CONNECTED; - tsk_set_importance(new_port, msg_importance(msg)); + tsk_set_importance(new_tsock, msg_importance(msg)); if (msg_named(msg)) { - new_port->conn_type = msg_nametype(msg); - new_port->conn_instance = msg_nameinst(msg); + new_tsock->conn_type = msg_nametype(msg); + new_tsock->conn_instance = msg_nameinst(msg); } /* @@ -2009,7 +2053,6 @@ static int tipc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; struct sk_buff *buf; u32 dnode; int res; @@ -2032,20 +2075,20 @@ restart: goto restart; } if (tipc_msg_reverse(buf, &dnode, TIPC_CONN_SHUTDOWN)) - tipc_link_xmit(buf, dnode, port->ref); - tipc_node_remove_conn(dnode, port->ref); + tipc_link_xmit(buf, dnode, tsk->ref); + tipc_node_remove_conn(dnode, tsk->ref); } else { - dnode = tsk_peer_node(port); + dnode = tsk_peer_node(tsk); buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode, tipc_own_addr, - tsk_peer_port(port), - port->ref, TIPC_CONN_SHUTDOWN); - tipc_link_xmit(buf, dnode, port->ref); + tsk_peer_port(tsk), + tsk->ref, TIPC_CONN_SHUTDOWN); + tipc_link_xmit(buf, dnode, tsk->ref); } - port->connected = 0; + tsk->connected = 0; sock->state = SS_DISCONNECTING; - tipc_node_remove_conn(dnode, port->ref); + tipc_node_remove_conn(dnode, tsk->ref); /* fall through */ case SS_DISCONNECTING: @@ -2069,7 +2112,6 @@ restart: static void tipc_sk_timeout(unsigned long ref) { struct tipc_sock *tsk; - struct tipc_port *port; struct sock *sk; struct sk_buff *buf = NULL; u32 peer_port, peer_node; @@ -2078,17 +2120,16 @@ static void tipc_sk_timeout(unsigned long ref) if (!tsk) goto exit; sk = &tsk->sk; - port = &tsk->port; bh_lock_sock(sk); - if (!port->connected) { + if (!tsk->connected) { bh_unlock_sock(sk); goto exit; } - peer_port = tsk_peer_port(port); - peer_node = tsk_peer_node(port); + peer_port = tsk_peer_port(tsk); + peer_node = tsk_peer_node(tsk); - if (port->probing_state == TIPC_CONN_PROBING) { + if (tsk->probing_state == TIPC_CONN_PROBING) { /* Previous probe not answered -> self abort */ buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, tipc_own_addr, @@ -2098,8 +2139,8 @@ static void tipc_sk_timeout(unsigned long ref) buf = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0, peer_node, tipc_own_addr, peer_port, ref, TIPC_OK); - port->probing_state = TIPC_CONN_PROBING; - k_start_timer(&port->timer, port->probing_interval); + tsk->probing_state = TIPC_CONN_PROBING; + k_start_timer(&tsk->timer, tsk->probing_interval); } bh_unlock_sock(sk); if (buf) @@ -2108,37 +2149,37 @@ exit: tipc_sk_put(tsk); } -static int tipc_sk_publish(struct tipc_port *port, uint scope, +static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq) { struct publication *publ; u32 key; - if (port->connected) + if (tsk->connected) return -EINVAL; - key = port->ref + port->pub_count + 1; - if (key == port->ref) + key = tsk->ref + tsk->pub_count + 1; + if (key == tsk->ref) return -EADDRINUSE; publ = tipc_nametbl_publish(seq->type, seq->lower, seq->upper, - scope, port->ref, key); + scope, tsk->ref, key); if (unlikely(!publ)) return -EINVAL; - list_add(&publ->pport_list, &port->publications); - port->pub_count++; - port->published = 1; + list_add(&publ->pport_list, &tsk->publications); + tsk->pub_count++; + tsk->published = 1; return 0; } -static int tipc_sk_withdraw(struct tipc_port *port, uint scope, +static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq) { struct publication *publ; struct publication *safe; int rc = -EINVAL; - list_for_each_entry_safe(publ, safe, &port->publications, pport_list) { + list_for_each_entry_safe(publ, safe, &tsk->publications, pport_list) { if (seq) { if (publ->scope != scope) continue; @@ -2157,12 +2198,12 @@ static int tipc_sk_withdraw(struct tipc_port *port, uint scope, publ->ref, publ->key); rc = 0; } - if (list_empty(&port->publications)) - port->published = 0; + if (list_empty(&tsk->publications)) + tsk->published = 0; return rc; } -static int tipc_sk_show(struct tipc_port *port, char *buf, +static int tipc_sk_show(struct tipc_sock *tsk, char *buf, int len, int full_id) { struct publication *publ; @@ -2172,26 +2213,26 @@ static int tipc_sk_show(struct tipc_port *port, char *buf, ret = tipc_snprintf(buf, len, "<%u.%u.%u:%u>:", tipc_zone(tipc_own_addr), tipc_cluster(tipc_own_addr), - tipc_node(tipc_own_addr), port->ref); + tipc_node(tipc_own_addr), tsk->ref); else - ret = tipc_snprintf(buf, len, "%-10u:", port->ref); + ret = tipc_snprintf(buf, len, "%-10u:", tsk->ref); - if (port->connected) { - u32 dport = tsk_peer_port(port); - u32 destnode = tsk_peer_node(port); + if (tsk->connected) { + u32 dport = tsk_peer_port(tsk); + u32 destnode = tsk_peer_node(tsk); ret += tipc_snprintf(buf + ret, len - ret, " connected to <%u.%u.%u:%u>", tipc_zone(destnode), tipc_cluster(destnode), tipc_node(destnode), dport); - if (port->conn_type != 0) + if (tsk->conn_type != 0) ret += tipc_snprintf(buf + ret, len - ret, - " via {%u,%u}", port->conn_type, - port->conn_instance); - } else if (port->published) { + " via {%u,%u}", tsk->conn_type, + tsk->conn_instance); + } else if (tsk->published) { ret += tipc_snprintf(buf + ret, len - ret, " bound to"); - list_for_each_entry(publ, &port->publications, pport_list) { + list_for_each_entry(publ, &tsk->publications, pport_list) { if (publ->lower == publ->upper) ret += tipc_snprintf(buf + ret, len - ret, " {%u,%u}", publ->type, @@ -2226,7 +2267,7 @@ struct sk_buff *tipc_sk_socks_show(void) tsk = tipc_sk_get_next(&ref); for (; tsk; tsk = tipc_sk_get_next(&ref)) { lock_sock(&tsk->sk); - str_len += tipc_sk_show(&tsk->port, pb + str_len, + str_len += tipc_sk_show(tsk, pb + str_len, pb_len - str_len, 0); release_sock(&tsk->sk); tipc_sk_put(tsk); @@ -2249,7 +2290,7 @@ void tipc_sk_reinit(void) for (; tsk; tsk = tipc_sk_get_next(&ref)) { lock_sock(&tsk->sk); - msg = &tsk->port.phdr; + msg = &tsk->phdr; msg_set_prevnode(msg, tipc_own_addr); msg_set_orignode(msg, tipc_own_addr); release_sock(&tsk->sk); @@ -2512,7 +2553,6 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; u32 value; int res; @@ -2530,16 +2570,16 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - res = tsk_set_importance(port, value); + res = tsk_set_importance(tsk, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) - tsk_set_unreliable(port, value); + tsk_set_unreliable(tsk, value); else res = -ENOPROTOOPT; break; case TIPC_DEST_DROPPABLE: - tsk_set_unreturnable(port, value); + tsk_set_unreturnable(tsk, value); break; case TIPC_CONN_TIMEOUT: tipc_sk(sk)->conn_timeout = value; @@ -2572,7 +2612,6 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; int len; u32 value; int res; @@ -2589,16 +2628,16 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - value = tsk_importance(port); + value = tsk_importance(tsk); break; case TIPC_SRC_DROPPABLE: - value = tsk_unreliable(port); + value = tsk_unreliable(tsk); break; case TIPC_DEST_DROPPABLE: - value = tsk_unreturnable(port); + value = tsk_unreturnable(tsk); break; case TIPC_CONN_TIMEOUT: - value = tipc_sk(sk)->conn_timeout; + value = tsk->conn_timeout; /* no need to set "res", since already 0 at this point */ break; case TIPC_NODE_RECVQ_DEPTH: diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 48772169bc77..baa43d03901e 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -36,85 +36,11 @@ #define _TIPC_SOCK_H #include -#include "msg.h" -#define TIPC_CONN_OK 0 -#define TIPC_CONN_PROBING 1 #define TIPC_CONNACK_INTV 256 #define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2) #define TIPC_CONN_OVERLOAD_LIMIT ((TIPC_FLOWCTRL_WIN * 2 + 1) * \ SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) - -/** - * struct tipc_port - TIPC port structure - * @lock: pointer to spinlock for controlling access to port - * @connected: non-zero if port is currently connected to a peer port - * @conn_type: TIPC type used when connection was established - * @conn_instance: TIPC instance used when connection was established - * @published: non-zero if port has one or more associated names - * @max_pkt: maximum packet size "hint" used when building messages sent by port - * @ref: unique reference to port in TIPC object registry - * @phdr: preformatted message header used when sending messages - * @port_list: adjacent ports in TIPC's global list of ports - * @publications: list of publications for port - * @pub_count: total # of publications port has made during its lifetime - * @probing_state: - * @probing_interval: - * @timer_ref: - */ -struct tipc_port { - int connected; - u32 conn_type; - u32 conn_instance; - int published; - u32 max_pkt; - u32 ref; - struct tipc_msg phdr; - struct list_head port_list; - struct list_head publications; - u32 pub_count; - u32 probing_state; - u32 probing_interval; - struct timer_list timer; -}; - -/** - * struct tipc_sock - TIPC socket structure - * @sk: socket - interacts with 'port' and with user via the socket API - * @port: port - interacts with 'sk' and with the rest of the TIPC stack - * @peer_name: the peer of the connection, if any - * @conn_timeout: the time we can wait for an unresponded setup request - * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue - * @link_cong: non-zero if owner must sleep because of link congestion - * @sent_unacked: # messages sent by socket, and not yet acked by peer - * @rcv_unacked: # messages read by user, but not yet acked back to peer - */ - -struct tipc_sock { - struct sock sk; - struct tipc_port port; - unsigned int conn_timeout; - atomic_t dupl_rcvcnt; - bool link_cong; - uint sent_unacked; - uint rcv_unacked; -}; - -static inline struct tipc_sock *tipc_sk(const struct sock *sk) -{ - return container_of(sk, struct tipc_sock, sk); -} - -static inline struct tipc_sock *tipc_port_to_sock(const struct tipc_port *port) -{ - return container_of(port, struct tipc_sock, port); -} - -static inline int tipc_sk_conn_cong(struct tipc_sock *tsk) -{ - return tsk->sent_unacked >= TIPC_FLOWCTRL_WIN; -} - int tipc_sk_rcv(struct sk_buff *buf); struct sk_buff *tipc_sk_socks_show(void); void tipc_sk_mcast_rcv(struct sk_buff *buf); -- cgit v1.2.3 From 690e36e726d00d2528bc569809048adf61550d80 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 23 Aug 2014 12:13:41 -0700 Subject: net: Allow raw buffers to be passed into the flow dissector. Drivers, and perhaps other entities we have not yet considered, sometimes want to know how deep the protocol headers go before deciding how large of an SKB to allocate and how much of the packet to place into the linear SKB area. For example, consider a driver which has a device which DMAs into pools of pages and then tells the driver where the data went in the DMA descriptor(s). The driver can then build an SKB and reference most of the data via SKB fragments (which are page/offset/length triplets). However at least some of the front of the packet should be placed into the linear SKB area, which comes before the fragments, so that packet processing can get at the headers efficiently. The first thing each protocol layer is going to do is a "pskb_may_pull()" so we might as well aggregate as much of this as possible while we're building the SKB in the driver. Part of supporting this is that we don't have an SKB yet, so we want to be able to let the flow dissector operate on a raw buffer in order to compute the offset of the end of the headers. So now we have a __skb_flow_dissect() which takes an explicit data pointer and length. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++++++------ include/net/flow_keys.h | 14 ++++++++++++-- net/core/flow_dissector.c | 40 ++++++++++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index abde271c18ae..18ddf9684a27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2567,20 +2567,26 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); -static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, - int len, void *buffer) +static inline void *__skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *data, int hlen, void *buffer) { - int hlen = skb_headlen(skb); - if (hlen - offset >= len) - return skb->data + offset; + return data + offset; - if (skb_copy_bits(skb, offset, buffer, len) < 0) + if (!skb || + skb_copy_bits(skb, offset, buffer, len) < 0) return NULL; return buffer; } +static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *buffer) +{ + return __skb_header_pointer(skb, offset, len, skb->data, + skb_headlen(skb), buffer); +} + /** * skb_needs_linearize - check if we need to linearize a given skb * depending on the given device features. diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index 6667a054763a..4040f63932c5 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -27,7 +27,17 @@ struct flow_keys { u8 ip_proto; }; -bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow); -__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto); +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + void *data, int hlen); +static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +{ + return __skb_flow_dissect(skb, flow, NULL, 0); +} +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen_proto); +static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +{ + return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); +} u32 flow_hash_from_keys(struct flow_keys *keys); #endif diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5f362c1d0332..660c6492fb78 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -34,29 +34,40 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ -__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen) { int poff = proto_ports_offset(ip_proto); + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + if (poff >= 0) { __be32 *ports, _ports; - ports = skb_header_pointer(skb, thoff + poff, - sizeof(_ports), &_ports); + ports = __skb_header_pointer(skb, thoff + poff, + sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } -EXPORT_SYMBOL(skb_flow_get_ports); +EXPORT_SYMBOL(__skb_flow_get_ports); -bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, void *data, int hlen) { int nhoff = skb_network_offset(skb); u8 ip_proto; __be16 proto = skb->protocol; + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + memset(flow, 0, sizeof(*flow)); again: @@ -65,7 +76,7 @@ again: const struct iphdr *iph; struct iphdr _iph; ip: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) return false; nhoff += iph->ihl * 4; @@ -83,7 +94,7 @@ ip: __be32 flow_label; ipv6: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) return false; @@ -113,7 +124,7 @@ ipv6: const struct vlan_hdr *vlan; struct vlan_hdr _vlan; - vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) return false; @@ -126,7 +137,7 @@ ipv6: struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; proto = hdr->proto; @@ -151,7 +162,7 @@ ipv6: __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; /* @@ -171,8 +182,9 @@ ipv6: const struct ethhdr *eth; struct ethhdr _eth; - eth = skb_header_pointer(skb, nhoff, - sizeof(_eth), &_eth); + eth = __skb_header_pointer(skb, nhoff, + sizeof(_eth), + data, hlen, &_eth); if (!eth) return false; proto = eth->h_proto; @@ -194,12 +206,12 @@ ipv6: flow->n_proto = proto; flow->ip_proto = ip_proto; - flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); + flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); flow->thoff = (u16) nhoff; return true; } -EXPORT_SYMBOL(skb_flow_dissect); +EXPORT_SYMBOL(__skb_flow_dissect); static u32 hashrnd __read_mostly; static __always_inline void __flow_hash_secret_init(void) -- cgit v1.2.3 From 8fc54f68919298ff9689d980efb495707ef43f30 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 23 Aug 2014 20:58:54 +0200 Subject: net: use reciprocal_scale() helper Replace open codings of (((u64) * ) >> 32) with reciprocal_scale(). Signed-off-by: Daniel Borkmann Cc: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- net/core/flow_dissector.c | 7 +++---- net/ipv4/inet_hashtables.c | 2 +- net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +- net/ipv4/udp.c | 6 +++--- net/ipv6/inet6_hashtables.c | 2 +- net/ipv6/udp.c | 4 ++-- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_expect.c | 3 ++- net/netfilter/nf_nat_core.c | 5 +++-- net/netfilter/xt_HMARK.c | 2 +- net/netfilter/xt_cluster.c | 3 ++- net/netfilter/xt_hashlimit.c | 2 +- net/sched/sch_fq_codel.c | 3 ++- 14 files changed, 24 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b65a5051361f..1421dad4cb29 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3124,8 +3124,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } if (map) { - tcpu = map->cpus[((u64) hash * map->len) >> 32]; - + tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 660c6492fb78..8ffcc97871c8 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -298,7 +298,7 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, qcount = dev->tc_to_txq[tc].count; } - return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset; + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } EXPORT_SYMBOL(__skb_tx_hash); @@ -371,9 +371,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) if (map->len == 1) queue_index = map->queues[0]; else - queue_index = map->queues[ - ((u64)skb_get_hash(skb) * map->len) >> 32]; - + queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), + map->len)]; if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 43116e8c8e13..9111a4e22155 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -229,7 +229,7 @@ begin: } } else if (score == hiscore && reuseport) { matches++; - if (((u64)phash * matches) >> 32 == 0) + if (reciprocal_scale(phash, matches) == 0) result = sk; phash = next_pseudo_random32(phash); } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2510c02c2d21..e90f83a3415b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -285,7 +285,7 @@ clusterip_hashfn(const struct sk_buff *skb, } /* node numbers are 1..n, not 0..n */ - return (((u64)hashval * config->num_total_nodes) >> 32) + 1; + return reciprocal_scale(hashval, config->num_total_nodes) + 1; } static inline int diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f57c0e4c2326..32f9571e776b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -224,7 +224,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, remaining = (high - low) + 1; rand = prandom_u32(); - first = (((u64)rand * remaining) >> 32) + low; + first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE */ @@ -448,7 +448,7 @@ begin: } } else if (score == badness && reuseport) { matches++; - if (((u64)hash * matches) >> 32 == 0) + if (reciprocal_scale(hash, matches) == 0) result = sk; hash = next_pseudo_random32(hash); } @@ -529,7 +529,7 @@ begin: } } else if (score == badness && reuseport) { matches++; - if (((u64)hash * matches) >> 32 == 0) + if (reciprocal_scale(hash, matches) == 0) result = sk; hash = next_pseudo_random32(hash); } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 262e13c02ec2..826019008958 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -198,7 +198,7 @@ begin: } } else if (score == hiscore && reuseport) { matches++; - if (((u64)phash * matches) >> 32 == 0) + if (reciprocal_scale(phash, matches) == 0) result = sk; phash = next_pseudo_random32(phash); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4836af8f582d..25ffe737b975 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -243,7 +243,7 @@ begin: goto exact_match; } else if (score == badness && reuseport) { matches++; - if (((u64)hash * matches) >> 32 == 0) + if (reciprocal_scale(hash, matches) == 0) result = sk; hash = next_pseudo_random32(hash); } @@ -323,7 +323,7 @@ begin: } } else if (score == badness && reuseport) { matches++; - if (((u64)hash * matches) >> 32 == 0) + if (reciprocal_scale(hash, matches) == 0) result = sk; hash = next_pseudo_random32(hash); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0b634e7f1652..5016a6929085 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -142,7 +142,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) static u32 __hash_bucket(u32 hash, unsigned int size) { - return ((u64)hash * size) >> 32; + return reciprocal_scale(hash, size); } static u32 hash_bucket(u32 hash, const struct net *net) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index f87e8f68ad45..91a1837acd0e 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -83,7 +83,8 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd); - return ((u64)hash * nf_ct_expect_hsize) >> 32; + + return reciprocal_scale(hash, nf_ct_expect_hsize); } struct nf_conntrack_expect * diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 552f97cd9fde..4e0b47831d43 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -126,7 +126,8 @@ hash_by_src(const struct net *net, u16 zone, /* Original src, to ensure we map it consistently if poss. */ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd); - return ((u64)hash * net->ct.nat_htable_size) >> 32; + + return reciprocal_scale(hash, net->ct.nat_htable_size); } /* Is this tuple already taken? (not by us) */ @@ -274,7 +275,7 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, } var_ipp->all[i] = (__force __u32) - htonl(minip + (((u64)j * dist) >> 32)); + htonl(minip + reciprocal_scale(j, dist)); if (var_ipp->all[i] != range->max_addr.all[i]) full_range = true; diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index 73b73f687c58..02afaf48a729 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -126,7 +126,7 @@ hmark_hash(struct hmark_tuple *t, const struct xt_hmark_info *info) hash = jhash_3words(src, dst, t->uports.v32, info->hashrnd); hash = hash ^ (t->proto & info->proto_mask); - return (((u64)hash * info->hmodulus) >> 32) + info->hoffset; + return reciprocal_scale(hash, info->hmodulus) + info->hoffset; } static void diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index f4af1bfafb1c..96fa26b20b67 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -55,7 +55,8 @@ xt_cluster_hash(const struct nf_conn *ct, WARN_ON(1); break; } - return (((u64)hash * info->total_nodes) >> 32); + + return reciprocal_scale(hash, info->total_nodes); } static inline bool diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 47dc6836830a..52eb3e03458d 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -135,7 +135,7 @@ hash_dst(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst) * give results between [0 and cfg.size-1] and same hash distribution, * but using a multiply, less expensive than a divide */ - return ((u64)hash * ht->cfg.size) >> 32; + return reciprocal_scale(hash, ht->cfg.size); } static struct dsthash_ent * diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 063b726bf1f8..cc56c8bb9bed 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -77,7 +77,8 @@ static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, hash = jhash_3words((__force u32)keys.dst, (__force u32)keys.src ^ keys.ip_proto, (__force u32)keys.ports, q->perturbation); - return ((u64)hash * q->flows_cnt) >> 32; + + return reciprocal_scale(hash, q->flows_cnt); } static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, -- cgit v1.2.3 From e2a093ff0dbfa4c5d99f25241cf33325e9691d91 Mon Sep 17 00:00:00 2001 From: Ana Rey Date: Wed, 6 Aug 2014 13:52:49 +0200 Subject: netfilter: nft_meta: add pkttype support Add pkttype support for ip, ipv6 and inet families of tables. This allows you to fetch the meta packet type based on the link layer information. The loopback traffic is a special case, the packet type is guessed from the network layer header. No special handling for bridge and arp since we're not going to see such traffic in the loopback interface. Joint work with Alvaro Neira Ayuso Signed-off-by: Alvaro Neira Ayuso Signed-off-by: Ana Rey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_meta.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 801bdd1e56e3..98144cdd8986 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -571,6 +571,7 @@ enum nft_exthdr_attributes { * @NFT_META_L4PROTO: layer 4 protocol number * @NFT_META_BRI_IIFNAME: packet input bridge interface name * @NFT_META_BRI_OIFNAME: packet output bridge interface name + * @NFT_META_PKTTYPE: packet type (skb->pkt_type), special handling for loopback */ enum nft_meta_keys { NFT_META_LEN, @@ -592,6 +593,7 @@ enum nft_meta_keys { NFT_META_L4PROTO, NFT_META_BRI_IIFNAME, NFT_META_BRI_OIFNAME, + NFT_META_PKTTYPE, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 852b178c6ae7..4f2862fc12c2 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include #include /* for TCP_TIME_WAIT */ @@ -124,6 +127,30 @@ void nft_meta_get_eval(const struct nft_expr *expr, dest->data[0] = skb->secmark; break; #endif + case NFT_META_PKTTYPE: + if (skb->pkt_type != PACKET_LOOPBACK) { + dest->data[0] = skb->pkt_type; + break; + } + + switch (pkt->ops->pf) { + case NFPROTO_IPV4: + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) + dest->data[0] = PACKET_MULTICAST; + else + dest->data[0] = PACKET_BROADCAST; + break; + case NFPROTO_IPV6: + if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) + dest->data[0] = PACKET_MULTICAST; + else + dest->data[0] = PACKET_BROADCAST; + break; + default: + WARN_ON(1); + goto err; + } + break; default: WARN_ON(1); goto err; @@ -195,6 +222,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: #endif + case NFT_META_PKTTYPE: break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From afc5be3079796b024823bad42dc5ebf716453575 Mon Sep 17 00:00:00 2001 From: Ana Rey Date: Sun, 24 Aug 2014 14:08:36 +0200 Subject: netfilter: nft_meta: Add cpu attribute support Add cpu support to meta expresion. This allows you to match packets with cpu number. Signed-off-by: Ana Rey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_meta.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 98144cdd8986..c9b6f00a3fb7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -572,6 +572,7 @@ enum nft_exthdr_attributes { * @NFT_META_BRI_IIFNAME: packet input bridge interface name * @NFT_META_BRI_OIFNAME: packet output bridge interface name * @NFT_META_PKTTYPE: packet type (skb->pkt_type), special handling for loopback + * @NFT_META_CPU: cpu id through smp_processor_id() */ enum nft_meta_keys { NFT_META_LEN, @@ -594,6 +595,7 @@ enum nft_meta_keys { NFT_META_BRI_IIFNAME, NFT_META_BRI_OIFNAME, NFT_META_PKTTYPE, + NFT_META_CPU, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 4f2862fc12c2..843e099a962d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include /* for TCP_TIME_WAIT */ @@ -151,6 +152,9 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; } break; + case NFT_META_CPU: + dest->data[0] = smp_processor_id(); + break; default: WARN_ON(1); goto err; @@ -223,6 +227,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_SECMARK: #endif case NFT_META_PKTTYPE: + case NFT_META_CPU: break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From ecc245c2bd5dcee91e6818fd3e7fb6454ad2ca06 Mon Sep 17 00:00:00 2001 From: Vytas Dauksa Date: Fri, 4 Apr 2014 16:10:14 +0100 Subject: netfilter: ipset: Removed invalid IPSET_ATTR_MARKMASK validation Markmask is an u32, hence it can't be greater then 4294967295 ( i.e. 0xffffffff ). This was causing smatch warning: net/netfilter/ipset/ip_set_hash_gen.h:1084 hash_ipmark_create() warn: impossible condition '(markmask > 4294967295) => (0-u32max > u32max)' Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 61c7fb052802..0398a92da6cc 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1093,7 +1093,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (tb[IPSET_ATTR_MARKMASK]) { markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); - if ((markmask > 4294967295u) || markmask == 0) + if (markmask == 0) return -IPSET_ERR_INVALID_MARKMASK; } #endif -- cgit v1.2.3 From 6e41ee684ea0c338e7d83fc88725581027856595 Mon Sep 17 00:00:00 2001 From: Sergey Popovich Date: Mon, 5 May 2014 11:07:06 +0300 Subject: netfilter: ipset: netnet,netportnet: Fix value range support for IPv4 Ranges of values are broken with hash:net,net and hash:net,port,net. hash:net,net ============ # ipset create test-nn hash:net,net # ipset add test-nn 10.0.10.1-10.0.10.127,10.0.0.0/8 # ipset list test-nn Name: test-nn Type: hash:net,net Revision: 0 Header: family inet hashsize 1024 maxelem 65536 Size in memory: 16960 References: 0 Members: 10.0.10.1,10.0.0.0/8 # ipset test test-nn 10.0.10.65,10.0.0.1 10.0.10.65,10.0.0.1 is NOT in set test-nn. # ipset test test-nn 10.0.10.1,10.0.0.1 10.0.10.1,10.0.0.1 is in set test-nn. hash:net,port,net ================= # ipset create test-npn hash:net,port,net # ipset add test-npn 10.0.10.1-10.0.10.127,tcp:80,10.0.0.0/8 # ipset list test-npn Name: test-npn Type: hash:net,port,net Revision: 0 Header: family inet hashsize 1024 maxelem 65536 Size in memory: 17344 References: 0 Members: 10.0.10.8/29,tcp:80,10.0.0.0 10.0.10.16/28,tcp:80,10.0.0.0 10.0.10.2/31,tcp:80,10.0.0.0 10.0.10.64/26,tcp:80,10.0.0.0 10.0.10.32/27,tcp:80,10.0.0.0 10.0.10.4/30,tcp:80,10.0.0.0 10.0.10.1,tcp:80,10.0.0.0 # ipset list test-npn # ipset test test-npn 10.0.10.126,tcp:80,10.0.0.2 10.0.10.126,tcp:80,10.0.0.2 is NOT in set test-npn. # ipset test test-npn 10.0.10.126,tcp:80,10.0.0.0 10.0.10.126,tcp:80,10.0.0.0 is in set test-npn. # ipset create test-npn hash:net,port,net # ipset add test-npn 10.0.10.0/24,tcp:80-81,10.0.0.0/8 # ipset list test-npn Name: test-npn Type: hash:net,port,net Revision: 0 Header: family inet hashsize 1024 maxelem 65536 Size in memory: 17024 References: 0 Members: 10.0.10.0,tcp:80,10.0.0.0 10.0.10.0,tcp:81,10.0.0.0 # ipset test test-npn 10.0.10.126,tcp:80,10.0.0.0 10.0.10.126,tcp:80,10.0.0.0 is NOT in set test-npn. # ipset test test-npn 10.0.10.0,tcp:80,10.0.0.0 10.0.10.0,tcp:80,10.0.0.0 is in set test-npn. Correctly setup from..to variables where no IPSET_ATTR_IP_TO{,2} attribute is given, so in range processing loop we construct proper cidr value. Check whenever we have no ranges and can short cut in hash:net,net properly. Use unlikely() where appropriate, to comply with other modules. Signed-off-by: Sergey Popovich Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_netnet.c | 13 +++++++------ net/netfilter/ipset/ip_set_hash_netportnet.c | 6 ++++-- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 3e99987e4bf2..96b131366e7b 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -203,7 +203,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], flags |= (IPSET_FLAG_NOMATCH << 16); } - if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] && + if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) { e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); @@ -219,9 +219,10 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; if (ip_to < ip) swap(ip, ip_to); - if (ip + UINT_MAX == ip_to) + if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } + } else + ip_set_mask_from_to(ip, ip_to, e.cidr[0]); ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { @@ -230,10 +231,10 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; if (ip2_to < ip2_from) swap(ip2_from, ip2_to); - if (ip2_from + UINT_MAX == ip2_to) + if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - - } + } else + ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); if (retried) ip = ntohl(h->next.ip[0]); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index c0d2ba73f8b2..2f0034347189 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -257,7 +257,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } + } else + ip_set_mask_from_to(ip, ip_to, e.cidr[0]); port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { @@ -275,7 +276,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - } + } else + ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); if (retried) ip = ntohl(h->next.ip[0]); -- cgit v1.2.3 From 94729f8a1e9d38c8df6c83799fde8d2eaef2ff54 Mon Sep 17 00:00:00 2001 From: Mark Rustad Date: Tue, 5 Aug 2014 04:56:21 -0700 Subject: netfilter: ipset: Resolve missing-field-initializer warnings Resolve missing-field-initializer warnings by providing a directed initializer. Signed-off-by: Mark Rustad Signed-off-by: Jeff Kirsher Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_bitmap_ip.c | 4 ++-- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 4 ++-- net/netfilter/ipset/ip_set_bitmap_port.c | 4 ++-- net/netfilter/ipset/ip_set_hash_ip.c | 8 ++++---- net/netfilter/ipset/ip_set_hash_ipport.c | 8 ++++---- net/netfilter/ipset/ip_set_hash_ipportip.c | 8 ++++---- 6 files changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 6f1f9f494808..dafdb39ef042 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -112,7 +112,7 @@ bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb, { struct bitmap_ip *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_ip_adt_elem e = { }; + struct bitmap_ip_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; @@ -132,7 +132,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], struct bitmap_ip *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; u32 ip = 0, ip_to = 0; - struct bitmap_ip_adt_elem e = { }; + struct bitmap_ip_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret = 0; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 740eabededd9..dbad505e79e3 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -203,7 +203,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_ipmac_adt_elem e = {}; + struct bitmap_ipmac_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; @@ -232,7 +232,7 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], { const struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_ipmac_adt_elem e = {}; + struct bitmap_ipmac_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0; int ret = 0; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index cf99676e69f8..a4b65ae1986c 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -104,7 +104,7 @@ bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_port_adt_elem e = {}; + struct bitmap_port_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); __be16 __port; u16 port = 0; @@ -129,7 +129,7 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_port_adt_elem e = {}; + struct bitmap_port_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port; /* wraparound */ u16 port_to; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index dd40607f878e..e52739938533 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -84,7 +84,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, { const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ip4_elem e = {}; + struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); __be32 ip; @@ -103,7 +103,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ip4_elem e = {}; + struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, hosts; int ret = 0; @@ -222,7 +222,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, { const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ip6_elem e = {}; + struct hash_ip6_elem e = { { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); @@ -239,7 +239,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ip6_elem e = {}; + struct hash_ip6_elem e = { { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 7597b82a8b03..f37a5ae8a5e0 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -94,7 +94,7 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport4_elem e = { }; + struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, @@ -111,7 +111,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport4_elem e = { }; + struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to; bool with_ports = false; @@ -258,7 +258,7 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport6_elem e = { }; + struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, @@ -275,7 +275,7 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport6_elem e = { }; + struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 672655ffd573..41ef00eda874 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -95,7 +95,7 @@ hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip4_elem e = { }; + struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, @@ -113,7 +113,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip4_elem e = { }; + struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to; bool with_ports = false; @@ -265,7 +265,7 @@ hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip6_elem e = { }; + struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, @@ -283,7 +283,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip6_elem e = { }; + struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; -- cgit v1.2.3 From 1b05756c48ea07ced9604ef01d11194d936da163 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 5 Aug 2014 22:02:34 +0200 Subject: netfilter: ipset: Fix warn: integer overflows 'sizeof(*map) + size * set->dsize' Dan Carpenter reported that the static checker emits the warning net/netfilter/ipset/ip_set_list_set.c:600 init_list_set() warn: integer overflows 'sizeof(*map) + size * set->dsize' Limit the maximal number of elements in list type of sets. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set_list.h | 1 + net/netfilter/ipset/ip_set_list_set.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netfilter/ipset/ip_set_list.h b/include/linux/netfilter/ipset/ip_set_list.h index 68c2aea897f5..fe2622a00151 100644 --- a/include/linux/netfilter/ipset/ip_set_list.h +++ b/include/linux/netfilter/ipset/ip_set_list.h @@ -6,5 +6,6 @@ #define IP_SET_LIST_DEFAULT_SIZE 8 #define IP_SET_LIST_MIN_SIZE 4 +#define IP_SET_LIST_MAX_SIZE 65536 #endif /* __IP_SET_LIST_H */ diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 3e2317f3cf68..f87adbad6076 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -597,7 +597,9 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) struct set_elem *e; u32 i; - map = kzalloc(sizeof(*map) + size * set->dsize, GFP_KERNEL); + map = kzalloc(sizeof(*map) + + min_t(u32, size, IP_SET_LIST_MAX_SIZE) * set->dsize, + GFP_KERNEL); if (!map) return false; -- cgit v1.2.3 From 573e8fca255a27e3573b51f9b183d62641c47a3d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 22 Aug 2014 13:33:47 -0700 Subject: net: skb_gro_checksum_* functions Add skb_gro_checksum_validate, skb_gro_checksum_validate_zero_check, and skb_gro_checksum_simple_validate, and __skb_gro_checksum_complete. These are the cognates of the normal checksum functions but are used in the gro_receive path and operate on GRO related fields in sk_buffs. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 76 +++++++++++++++++++++++++++++++++++++++++++++-- net/core/dev.c | 34 ++++++++++++++++++++- 2 files changed, 107 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7e2b0b8b5cd7..eb73444e1bd0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1883,7 +1883,13 @@ struct napi_gro_cb { u16 proto; /* Used in udp_gro_receive */ - u16 udp_mark; + u8 udp_mark:1; + + /* GRO checksum is valid */ + u8 csum_valid:1; + + /* Number encapsulation layers crossed */ + u8 encapsulation; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2154,11 +2160,77 @@ static inline void *skb_gro_network_header(struct sk_buff *skb) static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { - if (skb->ip_summed == CHECKSUM_COMPLETE) + if (NAPI_GRO_CB(skb)->csum_valid) NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum, csum_partial(start, len, 0)); } +/* GRO checksum functions. These are logical equivalents of the normal + * checksum functions (in skbuff.h) except that they operate on the GRO + * offsets and fields in sk_buff. + */ + +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb); + +static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, + bool zero_okay, + __sum16 check) +{ + return (skb->ip_summed != CHECKSUM_PARTIAL && + (skb->ip_summed != CHECKSUM_UNNECESSARY || + (NAPI_GRO_CB(skb)->encapsulation > skb->encapsulation)) && + (!zero_okay || check)); +} + +static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, + __wsum psum) +{ + if (NAPI_GRO_CB(skb)->csum_valid && + !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) + return 0; + + NAPI_GRO_CB(skb)->csum = psum; + + return __skb_gro_checksum_complete(skb); +} + +/* Update skb for CHECKSUM_UNNECESSARY when we verified a top level + * checksum or an encapsulated one during GRO. This saves work + * if we fallback to normal path with the packet. + */ +static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (NAPI_GRO_CB(skb)->encapsulation) + skb->encapsulation = 1; + } else if (skb->ip_summed != CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->encapsulation = 0; + } +} + +#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ + compute_pseudo) \ +({ \ + __sum16 __ret = 0; \ + if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ + __ret = __skb_gro_checksum_validate_complete(skb, \ + compute_pseudo(skb, proto)); \ + if (!__ret) \ + skb_gro_incr_csum_unnecessary(skb); \ + __ret; \ +}) + +#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ + __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) + +#define skb_gro_checksum_validate_zero_check(skb, proto, check, \ + compute_pseudo) \ + __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) + +#define skb_gro_checksum_simple_validate(skb) \ + __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/net/core/dev.c b/net/core/dev.c index 1421dad4cb29..b6a718ec11c1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3962,7 +3962,13 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; gro_list_prepare(napi, skb); - NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + } else { + NAPI_GRO_CB(skb)->csum_valid = 0; + } rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -3975,6 +3981,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; + NAPI_GRO_CB(skb)->encapsulation = 0; pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; @@ -4205,6 +4212,31 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_frags); +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); + /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. -- cgit v1.2.3 From 758f75d1ffa9ef482ae095f40087cf217e1f41b0 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 22 Aug 2014 13:34:22 -0700 Subject: gre: call skb_gro_checksum_simple_validate Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/gre_offload.c | 43 +++++++------------------------------------ 1 file changed, 7 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 6556263c8fa5..d1bd16937d93 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -119,28 +119,6 @@ out: return segs; } -/* Compute the whole skb csum in s/w and store it, then verify GRO csum - * starting from gro_offset. - */ -static __sum16 gro_skb_checksum(struct sk_buff *skb) -{ - __sum16 sum; - - skb->csum = skb_checksum(skb, 0, skb->len, 0); - NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum, - csum_partial(skb->data, skb_gro_offset(skb), 0)); - sum = csum_fold(NAPI_GRO_CB(skb)->csum); - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { - if (unlikely(!sum) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } else { - skb->ip_summed = CHECKSUM_COMPLETE; - skb->csum_complete_sw = 1; - } - - return sum; -} - static struct sk_buff **gre_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -192,22 +170,15 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, if (unlikely(!greh)) goto out_unlock; } - if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */ - __sum16 csum = 0; - - if (skb->ip_summed == CHECKSUM_COMPLETE) - csum = csum_fold(NAPI_GRO_CB(skb)->csum); - /* Don't trust csum error calculated/reported by h/w */ - if (skb->ip_summed == CHECKSUM_NONE || csum != 0) - csum = gro_skb_checksum(skb); - - /* GRE CSUM is the 1's complement of the 1's complement sum - * of the GRE hdr plus payload so it should add up to 0xffff - * (and 0 after csum_fold()) just like the IPv4 hdr csum. - */ - if (csum) + + /* Don't bother verifying checksum if we're going to flush anyway. */ + if (greh->flags & GRE_CSUM) { + if (!NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_simple_validate(skb)) goto out_unlock; + NAPI_GRO_CB(skb)->encapsulation++; } + flush = 0; for (p = *head; p; p = p->next) { -- cgit v1.2.3 From 149d0774a729497c6a876260d3884826088724b6 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 22 Aug 2014 13:34:30 -0700 Subject: tcp: Call skb_gro_checksum_validate In tcp[64]_gro_receive call skb_gro_checksum_validate to validate TCP checksum in the gro context. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/tcp_offload.c | 27 +++------------------------ net/ipv6/tcpv6_offload.c | 26 +++----------------------- 2 files changed, 6 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index bc1b83cb8309..72912533a191 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -288,35 +288,14 @@ static int tcp_v4_gso_send_check(struct sk_buff *skb) static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - /* Use the IP hdr immediately proceeding for this transport */ - const struct iphdr *iph = skb_gro_network_header(skb); - __wsum wsum; - /* Don't bother verifying checksum if we're going to flush anyway. */ - if (NAPI_GRO_CB(skb)->flush) - goto skip_csum; - - wsum = NAPI_GRO_CB(skb)->csum; - - switch (skb->ip_summed) { - case CHECKSUM_NONE: - wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), - 0); - - /* fall through */ - - case CHECKSUM_COMPLETE: - if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, - wsum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - + if (!NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_validate(skb, IPPROTO_TCP, + inet_gro_compute_pseudo)) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } -skip_csum: return tcp_gro_receive(head, skb); } diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 01b0ff9a0c2c..dbb3d9262bf6 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -35,34 +35,14 @@ static int tcp_v6_gso_send_check(struct sk_buff *skb) static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - const struct ipv6hdr *iph = skb_gro_network_header(skb); - __wsum wsum; - /* Don't bother verifying checksum if we're going to flush anyway. */ - if (NAPI_GRO_CB(skb)->flush) - goto skip_csum; - - wsum = NAPI_GRO_CB(skb)->csum; - - switch (skb->ip_summed) { - case CHECKSUM_NONE: - wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), - wsum); - - /* fall through */ - - case CHECKSUM_COMPLETE: - if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr, - wsum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - + if (!NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_validate(skb, IPPROTO_TCP, + ip6_gro_compute_pseudo)) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } -skip_csum: return tcp_gro_receive(head, skb); } -- cgit v1.2.3 From 57c67ff4bd92af634f7c91c40eb02a96dd785dda Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 22 Aug 2014 13:34:44 -0700 Subject: udp: additional GRO support Implement GRO for UDPv6. Add UDP checksum verification in gro_receive for both UDP4 and UDP6 calling skb_gro_checksum_validate_zero_check. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/udp.h | 18 +++++++++++++++ net/ipv4/udp.c | 1 + net/ipv4/udp_offload.c | 61 ++++++++++++++++++++++++++++++++++++-------------- net/ipv6/udp_offload.c | 33 +++++++++++++++++++++++++++ 4 files changed, 96 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/udp.h b/include/net/udp.h index 70f941368ace..16f4e80f0519 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -158,6 +158,24 @@ static inline __sum16 udp_v4_check(int len, __be32 saddr, void udp_set_csum(bool nocheck, struct sk_buff *skb, __be32 saddr, __be32 daddr, int len); +struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, + struct udphdr *uh); +int udp_gro_complete(struct sk_buff *skb, int nhoff); + +static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) +{ + struct udphdr *uh; + unsigned int hlen, off; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*uh); + uh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) + uh = skb_gro_header_slow(skb, hlen, off); + + return uh; +} + /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline void udp_lib_hash(struct sock *sk) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 32f9571e776b..3549c21fe5f7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 59035bc3008d..8ed460e3753c 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -228,29 +228,22 @@ unlock: } EXPORT_SYMBOL(udp_del_offload); -static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, + struct udphdr *uh) { struct udp_offload_priv *uo_priv; struct sk_buff *p, **pp = NULL; - struct udphdr *uh, *uh2; - unsigned int hlen, off; + struct udphdr *uh2; + unsigned int off = skb_gro_offset(skb); int flush = 1; if (NAPI_GRO_CB(skb)->udp_mark || - (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) + (!skb->encapsulation && !NAPI_GRO_CB(skb)->csum_valid)) goto out; /* mark that this skb passed once through the udp gro layer */ NAPI_GRO_CB(skb)->udp_mark = 1; - - off = skb_gro_offset(skb); - hlen = off + sizeof(*uh); - uh = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - uh = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!uh)) - goto out; - } + NAPI_GRO_CB(skb)->encapsulation++; rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); @@ -269,7 +262,12 @@ unflush: continue; uh2 = (struct udphdr *)(p->data + off); - if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + + /* Match ports and either checksums are either both zero + * or nonzero. + */ + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) || + (!uh->check ^ !uh2->check)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } @@ -286,7 +284,24 @@ out: return pp; } -static int udp_gro_complete(struct sk_buff *skb, int nhoff) +static struct sk_buff **udp4_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct udphdr *uh = udp_gro_udphdr(skb); + + /* Don't bother verifying checksum if we're going to flush anyway. */ + if (unlikely(!uh) || + (!NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + inet_gro_compute_pseudo))) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + return udp_gro_receive(head, skb, uh); +} + +int udp_gro_complete(struct sk_buff *skb, int nhoff) { struct udp_offload_priv *uo_priv; __be16 newlen = htons(skb->len - nhoff); @@ -311,12 +326,24 @@ static int udp_gro_complete(struct sk_buff *skb, int nhoff) return err; } +int udp4_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct iphdr *iph = ip_hdr(skb); + struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + + if (uh->check) + uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, + iph->daddr, 0); + + return udp_gro_complete(skb, nhoff); +} + static const struct net_offload udpv4_offload = { .callbacks = { .gso_send_check = udp4_ufo_send_check, .gso_segment = udp4_ufo_fragment, - .gro_receive = udp_gro_receive, - .gro_complete = udp_gro_complete, + .gro_receive = udp4_gro_receive, + .gro_complete = udp4_gro_complete, }, }; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 0ae3d98f83e0..b13e377e9c53 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -10,6 +10,7 @@ * UDPv6 GSO support */ #include +#include #include #include #include @@ -127,10 +128,42 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, out: return segs; } + +static struct sk_buff **udp6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct udphdr *uh = udp_gro_udphdr(skb); + + /* Don't bother verifying checksum if we're going to flush anyway. */ + if (unlikely(!uh) || + (!NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + ip6_gro_compute_pseudo))) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + return udp_gro_receive(head, skb, uh); +} + +int udp6_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + + if (uh->check) + uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr, + &ipv6h->daddr, 0); + + return udp_gro_complete(skb, nhoff); +} + static const struct net_offload udpv6_offload = { .callbacks = { .gso_send_check = udp6_ufo_send_check, .gso_segment = udp6_ufo_fragment, + .gro_receive = udp6_gro_receive, + .gro_complete = udp6_gro_complete, }, }; -- cgit v1.2.3 From 48a5fc773190bd5339869003fa65d38559bb8890 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 22 Aug 2014 13:34:52 -0700 Subject: gre: When GRE csum is present count as encap layer wrt csum In GRE demux if the GRE checksum pop rcv encapsulation so that any encapsulated checksums are treated as tunnel checksums. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/gre_demux.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 0485bf7f8f03..7c1a8ff974dd 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -125,6 +125,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, *csum_err = true; return -EINVAL; } + skb_pop_rcv_encapsulation(skb); options++; } -- cgit v1.2.3 From 67ba4152e8b77eada6a9c64e3c2c84d6112794fc Mon Sep 17 00:00:00 2001 From: Ian Morris Date: Sun, 24 Aug 2014 21:53:10 +0100 Subject: ipv6: White-space cleansing : Line Layouts This patch makes no changes to the logic of the code but simply addresses coding style issues as detected by checkpatch. Both objdump and diff -w show no differences. A number of items are addressed in this patch: * Multiple spaces converted to tabs * Spaces before tabs removed. * Spaces in pointer typing cleansed (char *)foo etc. * Remove space after sizeof * Ensure spacing around comparators such as if statements. Signed-off-by: Ian Morris Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 12 ++--- net/ipv6/af_inet6.c | 12 ++--- net/ipv6/ah6.c | 16 +++---- net/ipv6/datagram.c | 8 ++-- net/ipv6/esp6.c | 8 ++-- net/ipv6/exthdrs.c | 2 +- net/ipv6/icmp.c | 10 ++-- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/inet6_hashtables.c | 2 +- net/ipv6/ip6_flowlabel.c | 18 +++---- net/ipv6/ip6_icmp.c | 2 +- net/ipv6/ip6_input.c | 6 +-- net/ipv6/ip6_offload.c | 2 +- net/ipv6/ip6_output.c | 20 ++++---- net/ipv6/ip6_tunnel.c | 32 ++++++------- net/ipv6/ip6mr.c | 4 +- net/ipv6/ipv6_sockglue.c | 22 ++++----- net/ipv6/mcast.c | 100 +++++++++++++++++++-------------------- net/ipv6/ndisc.c | 16 +++---- net/ipv6/output_core.c | 2 +- net/ipv6/proc.c | 2 +- net/ipv6/raw.c | 8 ++-- net/ipv6/reassembly.c | 6 +-- net/ipv6/route.c | 20 ++++---- net/ipv6/sit.c | 12 ++--- net/ipv6/tunnel6.c | 2 +- net/ipv6/udp.c | 18 +++---- net/ipv6/xfrm6_input.c | 4 +- net/ipv6/xfrm6_policy.c | 22 ++++----- net/ipv6/xfrm6_state.c | 14 +++--- net/ipv6/xfrm6_tunnel.c | 2 +- 31 files changed, 203 insertions(+), 203 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0b239fc1816e..267ce3caee24 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -180,7 +180,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, - .use_tempaddr = 0, + .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, .regen_max_retry = REGEN_MAX_RETRY, @@ -1105,8 +1105,8 @@ retry: spin_unlock_bh(&ifp->lock); regen_advance = idev->cnf.regen_max_retry * - idev->cnf.dad_transmits * - NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; + idev->cnf.dad_transmits * + NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; write_unlock_bh(&idev->lock); /* A temporary address is created only if this calculated Preferred @@ -3035,7 +3035,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) struct hlist_head *h = &inet6_addr_lst[i]; spin_lock_bh(&addrconf_hash_lock); - restart: +restart: hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { hlist_del_init_rcu(&ifa->addr_lst); @@ -3547,8 +3547,8 @@ static void __net_exit if6_proc_net_exit(struct net *net) } static struct pernet_operations if6_proc_net_ops = { - .init = if6_proc_net_init, - .exit = if6_proc_net_exit, + .init = if6_proc_net_init, + .exit = if6_proc_net_exit, }; int __init if6_proc_init(void) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2daa3a133e49..b9393e6a21fe 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,15 +7,15 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * Fixes: + * Fixes: * piggy, Karl Knutson : Socket protocol table - * Hideaki YOSHIFUJI : sin6_scope_id support - * Arnaldo Melo : check proc_net_create return, cleanups + * Hideaki YOSHIFUJI : sin6_scope_id support + * Arnaldo Melo : check proc_net_create return, cleanups * * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "IPv6: " fmt diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 72a4930bdc0a..ac71d9e0a500 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -17,10 +17,10 @@ * Authors * * Mitsuru KANDA @USAGI : IPv6 Support - * Kazunori MIYAZAWA @USAGI : - * Kunihiro Ishiguro + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro * - * This file is derived from net/ipv4/ah.c. + * This file is derived from net/ipv4/ah.c. */ #define pr_fmt(fmt) "IPv6: " fmt @@ -284,7 +284,7 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) ipv6_rearrange_rthdr(iph, exthdr.rth); break; - default : + default: return 0; } @@ -478,7 +478,7 @@ static void ah6_input_done(struct crypto_async_request *base, int err) auth_data = ah_tmp_auth(work_iph, hdr_len); icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); - err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out; @@ -622,7 +622,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) goto out_free; } - err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out_free; @@ -647,8 +647,8 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); - struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; - struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+offset); + struct ipv6hdr *iph = (struct ipv6hdr *)skb->data; + struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+offset); struct xfrm_state *x; if (type != ICMPV6_PKT_TOOBIG && diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2753319524f1..1844e874a350 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -43,13 +43,13 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a) int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *daddr, *final_p, final; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *daddr, *final_p, final; struct dst_entry *dst; struct flowi6 fl6; struct ip6_flowlabel *flowlabel = NULL; - struct ipv6_txoptions *opt; + struct ipv6_txoptions *opt; int addr_type; int err; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index d15da1377149..7b6e830ebc6d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -17,10 +17,10 @@ * Authors * * Mitsuru KANDA @USAGI : IPv6 Support - * Kazunori MIYAZAWA @USAGI : - * Kunihiro Ishiguro + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro * - * This file is derived from net/ipv4/esp.c + * This file is derived from net/ipv4/esp.c */ #define pr_fmt(fmt) "IPv6: " fmt @@ -598,7 +598,7 @@ static int esp6_init_state(struct xfrm_state *x) case XFRM_MODE_BEET: if (x->sel.family != AF_INET6) x->props.header_len += IPV4_BEET_PHMAXLEN + - (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); break; case XFRM_MODE_TRANSPORT: break; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 8d67900aa003..bfde361b6134 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -142,7 +142,7 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) default: /* Other TLV code so scan list */ if (optlen > len) goto bad; - for (curr=procs; curr->type >= 0; curr++) { + for (curr = procs; curr->type >= 0; curr++) { if (curr->type == nh[off]) { /* type specific length/alignment checks will be performed in the diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 06ba3e58320b..394bb824fe4b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -503,7 +503,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) msg.type = type; len = skb->len - msg.offset; - len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); + len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)); if (len < 0) { LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n"); goto out_dst_release; @@ -636,7 +636,7 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) /* now skip over extension headers */ inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if (inner_offset<0) + if (inner_offset < 0) goto out; } else { inner_offset = sizeof(struct ipv6hdr); @@ -808,7 +808,7 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, memset(fl6, 0, sizeof(*fl6)); fl6->saddr = *saddr; fl6->daddr = *daddr; - fl6->flowi6_proto = IPPROTO_ICMPV6; + fl6->flowi6_proto = IPPROTO_ICMPV6; fl6->fl6_icmp_type = type; fl6->fl6_icmp_code = 0; fl6->flowi6_oif = oif; @@ -875,8 +875,8 @@ static void __net_exit icmpv6_sk_exit(struct net *net) } static struct pernet_operations icmpv6_sk_ops = { - .init = icmpv6_sk_init, - .exit = icmpv6_sk_exit, + .init = icmpv6_sk_init, + .exit = icmpv6_sk_exit, }; int __init icmpv6_init(void) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index a245e5ddffbd..8c33a0b52b64 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -163,7 +163,7 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk, EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); -void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 826019008958..5f7927c06f18 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -6,7 +6,7 @@ * Generic INET6 transport hashtables * * Authors: Lotsa people, from code originally in tcp, generalised here - * by Arnaldo Carvalho de Melo + * by Arnaldo Carvalho de Melo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 4052694c6f2c..2ace4749bef4 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -136,7 +136,7 @@ static void ip6_fl_gc(unsigned long dummy) spin_lock(&ip6_fl_lock); - for (i=0; i<=FL_HASH_MASK; i++) { + for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; @@ -239,7 +239,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, /* Socket flowlabel lists */ -struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, __be32 label) +struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; struct ipv6_pinfo *np = inet6_sk(sk); @@ -293,11 +293,11 @@ void fl6_free_socklist(struct sock *sk) following rthdr. */ -struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space, - struct ip6_flowlabel * fl, - struct ipv6_txoptions * fopt) +struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, + struct ip6_flowlabel *fl, + struct ipv6_txoptions *fopt) { - struct ipv6_txoptions * fl_opt = fl->opt; + struct ipv6_txoptions *fl_opt = fl->opt; if (fopt == NULL || fopt->opt_flen == 0) return fl_opt; @@ -388,7 +388,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, goto done; msg.msg_controllen = olen; - msg.msg_control = (void*)(fl->opt+1); + msg.msg_control = (void *)(fl->opt+1); memset(&flowi6, 0, sizeof(flowi6)); err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, fl->opt, @@ -517,7 +517,7 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) struct net *net = sock_net(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_flowlabel_req freq; - struct ipv6_fl_socklist *sfl1=NULL; + struct ipv6_fl_socklist *sfl1 = NULL; struct ipv6_fl_socklist *sfl; struct ipv6_fl_socklist __rcu **sflp; struct ip6_flowlabel *fl, *fl1 = NULL; @@ -542,7 +542,7 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; - (sfl = rcu_dereference(*sflp))!=NULL; + (sfl = rcu_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 4578e23834f7..14dacc544c3e 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -13,7 +13,7 @@ static ip6_icmp_send_t __rcu *ip6_icmp_send; int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? - 0 : -EBUSY; + 0 : -EBUSY; } EXPORT_SYMBOL(inet6_register_icmp_sender); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 51d54dc376f3..a3084ab5df6c 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -15,8 +15,8 @@ */ /* Changes * - * Mitsuru KANDA @USAGI and - * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs(). + * Mitsuru KANDA @USAGI and + * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs(). */ #include @@ -65,7 +65,7 @@ int ip6_rcv_finish(struct sk_buff *skb) int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct ipv6hdr *hdr; - u32 pkt_len; + u32 pkt_len; struct inet6_dev *idev; struct net *net = dev_net(skb->dev); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 65eda2a8af48..5bcda338bcef 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -244,7 +244,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, continue; iph2 = (struct ipv6hdr *)(p->data + off); - first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ; + first_word = *(__be32 *)iph ^ *(__be32 *)iph2; /* All fields must match except length and Traffic Class. * XXX skbs on the gro_list have all been parsed and pulled diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 315a55d66079..4ead5547b5c6 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -20,7 +20,7 @@ * etc. * * H. von Brand : Added missing #include - * Imran Patel : frag id should be in NBO + * Imran Patel : frag id should be in NBO * Kazunori MIYAZAWA @USAGI * : add ip6_append_data and related functions * for datagram xmit @@ -555,14 +555,14 @@ static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; - struct rt6_info *rt = (struct rt6_info*)skb_dst(skb); + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; unsigned int mtu, hlen, left, len; int hroom, troom; __be32 frag_id = 0; - int ptr, offset = 0, err=0; + int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; struct net *net = dev_net(skb_dst(skb)->dev); @@ -637,7 +637,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } __skb_pull(skb, hlen); - fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr)); + fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); __skb_push(skb, hlen); skb_reset_network_header(skb); memcpy(skb_network_header(skb), tmp_hdr, hlen); @@ -662,7 +662,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) if (frag) { frag->ip_summed = CHECKSUM_NONE; skb_reset_transport_header(frag); - fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr)); + fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr)); __skb_push(frag, hlen); skb_reset_network_header(frag); memcpy(skb_network_header(frag), tmp_hdr, @@ -681,7 +681,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } err = output(skb); - if(!err) + if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); @@ -742,7 +742,7 @@ slow_path: /* * Keep copying data until we run out. */ - while(left > 0) { + while (left > 0) { len = left; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > mtu) @@ -865,7 +865,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, /* Yes, checking route validity in not connected * case is not very simple. Take into account, * that we do not support routing by source, TOS, - * and MSG_DONTROUTE --ANK (980726) + * and MSG_DONTROUTE --ANK (980726) * * 1. ip6_rt_check(): If route was host route, * check that cached destination is current. @@ -1049,7 +1049,7 @@ static inline int ip6_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int mtu,unsigned int flags, + int transhdrlen, int mtu, unsigned int flags, struct rt6_info *rt) { @@ -1072,7 +1072,7 @@ static inline int ip6_ufo_append_data(struct sock *sk, skb_reserve(skb, hh_len); /* create space for UDP/IP header */ - skb_put(skb,fragheaderlen + transhdrlen); + skb_put(skb, fragheaderlen + transhdrlen); /* initialize network header pointer */ skb_reset_network_header(skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index f9de5a695072..e01bd0399297 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -408,12 +408,12 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw; __u8 nexthdr = ipv6h->nexthdr; - __u16 off = sizeof (*ipv6h); + __u16 off = sizeof(*ipv6h); while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { __u16 optlen = 0; struct ipv6_opt_hdr *hdr; - if (raw + off + sizeof (*hdr) > skb->data && + if (raw + off + sizeof(*hdr) > skb->data && !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr))) break; @@ -530,7 +530,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; - if ((len = sizeof (*ipv6h) + ntohs(ipv6h->payload_len)) > mtu) { + if ((len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len)) > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; @@ -991,7 +991,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - sizeof (*ipv6h); + mtu = dst_mtu(dst) - sizeof(*ipv6h); if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1083,7 +1083,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_IPIP; dsfield = ipv4_get_dsfield(iph); @@ -1135,7 +1135,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_IPV6; dsfield = ipv6_get_dsfield(ipv6h); @@ -1229,11 +1229,11 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) if (rt->dst.dev) { dev->hard_header_len = rt->dst.dev->hard_header_len + - sizeof (struct ipv6hdr); + sizeof(struct ipv6hdr); - dev->mtu = rt->dst.dev->mtu - sizeof (struct ipv6hdr); + dev->mtu = rt->dst.dev->mtu - sizeof(struct ipv6hdr); if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu-=8; + dev->mtu -= 8; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; @@ -1350,7 +1350,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { err = -EFAULT; break; } @@ -1362,7 +1362,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) { err = -EFAULT; } break; @@ -1372,7 +1372,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && @@ -1407,7 +1407,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); @@ -1482,11 +1482,11 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; - dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); - dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr); + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr); + dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr); t = netdev_priv(dev); if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu-=8; + dev->mtu -= 8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index f9a3fd320d1d..0171f08325c3 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -845,7 +845,7 @@ static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c) atomic_dec(&mrt->cache_resolve_queue_len); - while((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) { + while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; @@ -1103,7 +1103,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, * Play the pending entries through our router */ - while((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { + while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 0c289982796d..64177efe5cca 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -66,12 +66,12 @@ int ip6_ra_control(struct sock *sk, int sel) if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; - new_ra = (sel>=0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; write_lock_bh(&ip6_ra_lock); - for (rap = &ip6_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { - if (sel>=0) { + if (sel >= 0) { write_unlock_bh(&ip6_ra_lock); kfree(new_ra); return -EADDRINUSE; @@ -130,7 +130,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, int retv = -ENOPROTOOPT; if (optval == NULL) - val=0; + val = 0; else { if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) @@ -139,7 +139,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, val = 0; } - valbool = (val!=0); + valbool = (val != 0); if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); @@ -474,7 +474,7 @@ sticky_done: goto done; msg.msg_controllen = optlen; - msg.msg_control = (void*)(opt+1); + msg.msg_control = (void *)(opt+1); retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, opt, &junk, &junk, &junk); @@ -687,7 +687,7 @@ done: retv = -ENOBUFS; break; } - gsf = kmalloc(optlen,GFP_KERNEL); + gsf = kmalloc(optlen, GFP_KERNEL); if (!gsf) { retv = -ENOBUFS; break; @@ -921,7 +921,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, if (!opt) return 0; - switch(optname) { + switch (optname) { case IPV6_HOPOPTS: hdr = opt->hopopt; break; @@ -1284,9 +1284,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); - if(put_user(len, optlen)) + if (put_user(len, optlen)) return -EFAULT; - if(copy_to_user(optval,&val,len)) + if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } @@ -1299,7 +1299,7 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.getsockopt(sk, level, optname, optval, optlen); - if(level != SOL_IPV6) + if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 617f0958e164..70881795da96 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -232,7 +232,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) spin_lock(&ipv6_sk_mc_lock); for (lnk = &np->ipv6_mc_list; (mc_lst = rcu_dereference_protected(*lnk, - lockdep_is_held(&ipv6_sk_mc_lock))) !=NULL ; + lockdep_is_held(&ipv6_sk_mc_lock))) != NULL; lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { @@ -390,7 +390,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; - for (i=0; isl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) break; @@ -407,7 +407,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, /* update the interface filter */ ip6_mc_del_src(idev, group, omode, 1, source, 1); - for (j=i+1; jsl_count; j++) + for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; @@ -433,19 +433,19 @@ int ip6_mc_source(int add, int omode, struct sock *sk, newpsl->sl_max = count; newpsl->sl_count = count - IP6_SFBLOCK; if (psl) { - for (i=0; isl_count; i++) + for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); } pmc->sflist = psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ - for (i=0; isl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) /* There is an error in the address. */ goto done; } - for (j=psl->sl_count-1; j>=i; j--) + for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = *source; psl->sl_count++; @@ -514,7 +514,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) goto done; } newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; - for (i=0; isl_count; ++i) { + for (i = 0; i < newpsl->sl_count; ++i) { struct sockaddr_in6 *psin6; psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i]; @@ -606,7 +606,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, * on ipv6_sk_mc_lock and a write lock on pmc->sflock. We * have the socket lock, so reading here is safe. */ - for (i=0; isl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { if (ipv6_addr_equal(&psl->sl_addr[i], src_addr)) break; } @@ -762,7 +762,7 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) pmc->mca_tomb = im->mca_tomb; pmc->mca_sources = im->mca_sources; im->mca_tomb = im->mca_sources = NULL; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) psf->sf_crcount = pmc->mca_crcount; } spin_unlock_bh(&im->mca_lock); @@ -780,7 +780,7 @@ static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) spin_lock_bh(&idev->mc_lock); pmc_prev = NULL; - for (pmc=idev->mc_tomb; pmc; pmc=pmc->next) { + for (pmc = idev->mc_tomb; pmc; pmc = pmc->next) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) break; pmc_prev = pmc; @@ -794,7 +794,7 @@ static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) spin_unlock_bh(&idev->mc_lock); if (pmc) { - for (psf=pmc->mca_tomb; psf; psf=psf_next) { + for (psf = pmc->mca_tomb; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -821,14 +821,14 @@ static void mld_clear_delrec(struct inet6_dev *idev) /* clear dead sources, too */ read_lock_bh(&idev->lock); - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { struct ip6_sf_list *psf, *psf_next; spin_lock_bh(&pmc->mca_lock); psf = pmc->mca_tomb; pmc->mca_tomb = NULL; spin_unlock_bh(&pmc->mca_lock); - for (; psf; psf=psf_next) { + for (; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -917,7 +917,7 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) struct ifmcaddr6 *ma, **map; write_lock_bh(&idev->lock); - for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) { + for (map = &idev->mc_list; (ma = *map) != NULL; map = &ma->next) { if (ipv6_addr_equal(&ma->mca_addr, addr)) { if (--ma->mca_users == 0) { *map = ma->next; @@ -968,7 +968,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, idev = __in6_dev_get(dev); if (idev) { read_lock_bh(&idev->lock); - for (mc = idev->mc_list; mc; mc=mc->next) { + for (mc = idev->mc_list; mc; mc = mc->next) { if (ipv6_addr_equal(&mc->mca_addr, group)) break; } @@ -977,7 +977,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, struct ip6_sf_list *psf; spin_lock_bh(&mc->mca_lock); - for (psf=mc->mca_sources;psf;psf=psf->sf_next) { + for (psf = mc->mca_sources; psf; psf = psf->sf_next) { if (ipv6_addr_equal(&psf->sf_addr, src_addr)) break; } @@ -986,7 +986,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, psf->sf_count[MCAST_EXCLUDE] != mc->mca_sfcount[MCAST_EXCLUDE]; else - rv = mc->mca_sfcount[MCAST_EXCLUDE] !=0; + rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0; spin_unlock_bh(&mc->mca_lock); } else rv = true; /* don't filter unspecified source */ @@ -1077,10 +1077,10 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, int i, scount; scount = 0; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; isf_count[MCAST_INCLUDE] || pmc->mca_sfcount[MCAST_EXCLUDE] != @@ -1110,10 +1110,10 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, /* mark INCLUDE-mode sources */ scount = 0; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; isf_addr)) { psf->sf_gsresp = 1; scount++; @@ -1364,13 +1364,13 @@ int igmp6_event_query(struct sk_buff *skb) read_lock_bh(&idev->lock); if (group_type == IPV6_ADDR_ANY) { - for (ma = idev->mc_list; ma; ma=ma->next) { + for (ma = idev->mc_list; ma; ma = ma->next) { spin_lock_bh(&ma->mca_lock); igmp6_group_queried(ma, max_delay); spin_unlock_bh(&ma->mca_lock); } } else { - for (ma = idev->mc_list; ma; ma=ma->next) { + for (ma = idev->mc_list; ma; ma = ma->next) { if (!ipv6_addr_equal(group, &ma->mca_addr)) continue; spin_lock_bh(&ma->mca_lock); @@ -1434,7 +1434,7 @@ int igmp6_event_report(struct sk_buff *skb) */ read_lock_bh(&idev->lock); - for (ma = idev->mc_list; ma; ma=ma->next) { + for (ma = idev->mc_list; ma; ma = ma->next) { if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { spin_lock(&ma->mca_lock); if (del_timer(&ma->mca_timer)) @@ -1498,7 +1498,7 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) struct ip6_sf_list *psf; int scount = 0; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; @@ -1712,7 +1712,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } first = 1; psf_prev = NULL; - for (psf=*psf_list; psf; psf=psf_next) { + for (psf = *psf_list; psf; psf = psf_next) { struct in6_addr *psrc; psf_next = psf->sf_next; @@ -1791,7 +1791,7 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) read_lock_bh(&idev->lock); if (!pmc) { - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { if (pmc->mca_flags & MAF_NOREPORT) continue; spin_lock_bh(&pmc->mca_lock); @@ -1824,7 +1824,7 @@ static void mld_clear_zeros(struct ip6_sf_list **ppsf) struct ip6_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; - for (psf=*ppsf; psf; psf = psf_next) { + for (psf = *ppsf; psf; psf = psf_next) { psf_next = psf->sf_next; if (psf->sf_crcount == 0) { if (psf_prev) @@ -1848,7 +1848,7 @@ static void mld_send_cr(struct inet6_dev *idev) /* deleted MCA's */ pmc_prev = NULL; - for (pmc=idev->mc_tomb; pmc; pmc=pmc_next) { + for (pmc = idev->mc_tomb; pmc; pmc = pmc_next) { pmc_next = pmc->next; if (pmc->mca_sfmode == MCAST_INCLUDE) { type = MLD2_BLOCK_OLD_SOURCES; @@ -1881,7 +1881,7 @@ static void mld_send_cr(struct inet6_dev *idev) spin_unlock(&idev->mc_lock); /* change recs */ - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) { type = MLD2_BLOCK_OLD_SOURCES; @@ -2018,7 +2018,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev) skb = NULL; read_lock_bh(&idev->lock); - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; @@ -2063,7 +2063,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, int rv = 0; psf_prev = NULL; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; @@ -2104,7 +2104,7 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; read_lock_bh(&idev->lock); - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } @@ -2124,7 +2124,7 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_sfcount[sfmode]--; } err = 0; - for (i=0; i 0; @@ -2140,7 +2140,7 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_sfmode = MCAST_INCLUDE; pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; - for (psf=pmc->mca_sources; psf; psf = psf->sf_next) + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; mld_ifc_event(pmc->idev); } else if (sf_setstate(pmc) || changerec) @@ -2159,7 +2159,7 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, struct ip6_sf_list *psf, *psf_prev; psf_prev = NULL; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; @@ -2184,7 +2184,7 @@ static void sf_markstate(struct ifmcaddr6 *pmc) struct ip6_sf_list *psf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) if (pmc->mca_sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && @@ -2201,7 +2201,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) int new_in, rv; rv = 0; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; @@ -2211,8 +2211,8 @@ static int sf_setstate(struct ifmcaddr6 *pmc) if (!psf->sf_oldin) { struct ip6_sf_list *prev = NULL; - for (dpsf=pmc->mca_tomb; dpsf; - dpsf=dpsf->sf_next) { + for (dpsf = pmc->mca_tomb; dpsf; + dpsf = dpsf->sf_next) { if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; @@ -2234,7 +2234,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) * add or update "delete" records if an active filter * is now inactive */ - for (dpsf=pmc->mca_tomb; dpsf; dpsf=dpsf->sf_next) + for (dpsf = pmc->mca_tomb; dpsf; dpsf = dpsf->sf_next) if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; @@ -2268,7 +2268,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; read_lock_bh(&idev->lock); - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } @@ -2284,7 +2284,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!delta) pmc->mca_sfcount[sfmode]++; err = 0; - for (i=0; imca_sfcount[sfmode]--; - for (j=0; jmca_sfcount[MCAST_EXCLUDE] != 0)) { struct ip6_sf_list *psf; @@ -2308,7 +2308,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; - for (psf=pmc->mca_sources; psf; psf = psf->sf_next) + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; mld_ifc_event(idev); } else if (sf_setstate(pmc)) @@ -2322,12 +2322,12 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *nextpsf; - for (psf=pmc->mca_tomb; psf; psf=nextpsf) { + for (psf = pmc->mca_tomb; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } pmc->mca_tomb = NULL; - for (psf=pmc->mca_sources; psf; psf=nextpsf) { + for (psf = pmc->mca_sources; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } @@ -2471,7 +2471,7 @@ void ipv6_mc_down(struct inet6_dev *idev) mld_gq_stop_timer(idev); mld_dad_stop_timer(idev); - for (i = idev->mc_list; i; i=i->next) + for (i = idev->mc_list; i; i = i->next) igmp6_group_dropped(i); read_unlock_bh(&idev->lock); @@ -2488,7 +2488,7 @@ void ipv6_mc_up(struct inet6_dev *idev) /* Install multicast list, except for all-nodes (already installed) */ read_lock_bh(&idev->lock); - for (i = idev->mc_list; i; i=i->next) + for (i = idev->mc_list; i; i = i->next) igmp6_group_added(i); read_unlock_bh(&idev->lock); } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 339078f95d1b..995a82945994 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -175,7 +175,7 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, type = cur->nd_opt_type; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); - } while(cur < end && cur->nd_opt_type != type); + } while (cur < end && cur->nd_opt_type != type); return cur <= end && cur->nd_opt_type == type ? cur : NULL; } @@ -192,7 +192,7 @@ static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); - } while(cur < end && !ndisc_is_useropt(cur)); + } while (cur < end && !ndisc_is_useropt(cur)); return cur <= end && ndisc_is_useropt(cur) ? cur : NULL; } @@ -296,7 +296,7 @@ static u32 ndisc_hash(const void *pkey, static int ndisc_constructor(struct neighbour *neigh) { - struct in6_addr *addr = (struct in6_addr*)&neigh->primary_key; + struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key; struct net_device *dev = neigh->dev; struct inet6_dev *in6_dev; struct neigh_parms *parms; @@ -344,7 +344,7 @@ static int ndisc_constructor(struct neighbour *neigh) static int pndisc_constructor(struct pneigh_entry *n) { - struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; @@ -357,7 +357,7 @@ static int pndisc_constructor(struct pneigh_entry *n) static void pndisc_destructor(struct pneigh_entry *n) { - struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; @@ -1065,7 +1065,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) int optlen; unsigned int pref = 0; - __u8 * opt = (__u8 *)(ra_msg + 1); + __u8 *opt = (__u8 *)(ra_msg + 1); optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - sizeof(struct ra_msg); @@ -1319,7 +1319,7 @@ skip_linkparms: continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; - rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3, + rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, &ipv6_hdr(skb)->saddr); } } @@ -1352,7 +1352,7 @@ skip_routeinfo: __be32 n; u32 mtu; - memcpy(&n, ((u8*)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); + memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); mtu = ntohl(n); if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 5ec867e4a8b7..fc24c390af05 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -35,7 +35,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) if (found_rhdr) return offset; break; - default : + default: return offset; } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 2d6f860e5c1e..1752cd0b4882 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -8,7 +8,7 @@ * except it reports the sockets in the INET6 address family. * * Authors: David S. Miller (davem@caip.rutgers.edu) - * YOSHIFUJI Hideaki + * YOSHIFUJI Hideaki * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 39d44226e402..896af8807979 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -889,7 +889,7 @@ back_from_confirm: else { lock_sock(sk); err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, - len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info*)dst, + len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info *)dst, msg->msg_flags, dontfrag); if (err) @@ -902,7 +902,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); - return err<0?err:len; + return err < 0 ? err : len; do_confirm: dst_confirm(dst); if (!(msg->msg_flags & MSG_PROBE) || len) @@ -1045,7 +1045,7 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, struct raw6_sock *rp = raw6_sk(sk); int val, len; - if (get_user(len,optlen)) + if (get_user(len, optlen)) return -EFAULT; switch (optname) { @@ -1069,7 +1069,7 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval,&val,len)) + if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index c6557d9f7808..fe156d99179e 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -68,7 +68,7 @@ struct ip6frag_skb_cb int offset; }; -#define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb)) +#define FRAG6_CB(skb) ((struct ip6frag_skb_cb *)((skb)->cb)) static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { @@ -289,7 +289,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, goto found; } prev = NULL; - for(next = fq->q.fragments; next != NULL; next = next->next) { + for (next = fq->q.fragments; next != NULL; next = next->next) { if (FRAG6_CB(next)->offset >= offset) break; /* bingo! */ prev = next; @@ -529,7 +529,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. header */ - if (hdr->payload_len==0) + if (hdr->payload_len == 0) goto fail_hdr; if (!pskb_may_pull(skb, (skb_transport_offset(skb) + diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f23fbd28a501..76c793096893 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -813,7 +813,7 @@ out: } -struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6, +struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, int flags) { return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup); @@ -1024,7 +1024,7 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); } -struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, +struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6) { int flags = 0; @@ -1149,7 +1149,7 @@ static void ip6_link_failure(struct sk_buff *skb) static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu) { - struct rt6_info *rt6 = (struct rt6_info*)dst; + struct rt6_info *rt6 = (struct rt6_info *)dst; dst_confirm(dst); if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { @@ -1924,7 +1924,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, return NULL; read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0); + fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); if (!fn) goto out; @@ -1983,7 +1983,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev return NULL; read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) { + for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->rt6i_gateway, addr)) @@ -2068,7 +2068,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) struct in6_rtmsg rtmsg; int err; - switch(cmd) { + switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -2191,7 +2191,7 @@ int ip6_route_get_saddr(struct net *net, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt); + struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt); int err = 0; if (rt->rt6i_prefsrc.plen) *saddr = rt->rt6i_prefsrc.addr; @@ -2486,7 +2486,7 @@ beginning: return last_err; } -static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh) +static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct fib6_config cfg; int err; @@ -2501,7 +2501,7 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh) return ip6_route_del(&cfg); } -static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh) +static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct fib6_config cfg; int err; @@ -2693,7 +2693,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) prefix, 0, NLM_F_MULTI); } -static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh) +static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 6163f851dc01..86e3fa81f85e 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -812,9 +812,9 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, const struct ipv6hdr *iph6 = ipv6_hdr(skb); u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - unsigned int max_headroom; /* The extra header space needed */ + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; struct flowi4 fl4; int mtu; @@ -1123,7 +1123,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, #endif static int -ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { int err = 0; struct ip_tunnel_parm p; @@ -1339,10 +1339,10 @@ static void ipip6_dev_free(struct net_device *dev) static void ipip6_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip6_netdev_ops; - dev->destructor = ipip6_dev_free; + dev->destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); dev->flags = IFF_NOARP; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 2c4e4c5c7614..7a53d397c739 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -15,7 +15,7 @@ * along with this program; if not, see . * * Authors Mitsuru KANDA - * YOSHIFUJI Hideaki + * YOSHIFUJI Hideaki */ #define pr_fmt(fmt) "IPv6: " fmt diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 25ffe737b975..12fcce8fba46 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -373,8 +373,8 @@ EXPORT_SYMBOL_GPL(udp6_lib_lookup); /* - * This should be easy, if there is something there we - * return it, otherwise we block. + * This should be easy, if there is something there we + * return it, otherwise we block. */ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, @@ -530,7 +530,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct in6_addr *saddr = &hdr->saddr; const struct in6_addr *daddr = &hdr->daddr; - struct udphdr *uh = (struct udphdr*)(skb->data+offset); + struct udphdr *uh = (struct udphdr *)(skb->data+offset); struct sock *sk; int err; struct net *net = dev_net(skb->dev); @@ -596,7 +596,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) static __inline__ void udpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, - u8 code, int offset, __be32 info ) + u8 code, int offset, __be32 info) { __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); } @@ -960,10 +960,10 @@ static void udp_v6_flush_pending_frames(struct sock *sk) } /** - * udp6_hwcsum_outgoing - handle outgoing HW checksumming - * @sk: socket we are sending on - * @skb: sk_buff containing the filled-in UDP header - * (checksum field must be zeroed out) + * udp6_hwcsum_outgoing - handle outgoing HW checksumming + * @sk: socket we are sending on + * @skb: sk_buff containing the filled-in UDP header + * (checksum field must be zeroed out) */ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, const struct in6_addr *saddr, @@ -1294,7 +1294,7 @@ do_append_data: getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), hlimit, tclass, opt, &fl6, - (struct rt6_info*)dst, + (struct rt6_info *)dst, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag); if (err) udp_v6_flush_pending_frames(sk); diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index f8c3cf842f53..cd4c98c7c805 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -3,8 +3,8 @@ * * Authors: * Mitsuru KANDA @USAGI - * Kazunori MIYAZAWA @USAGI - * Kunihiro Ishiguro + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro * YOSHIFUJI Hideaki @USAGI * IPv6 support */ diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 2a0bbda2c76a..ac49f84fe2c3 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -3,11 +3,11 @@ * * Authors: * Mitsuru KANDA @USAGI - * Kazunori MIYAZAWA @USAGI - * Kunihiro Ishiguro - * IPv6 support - * YOSHIFUJI Hideaki - * Split up af-specific portion + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * YOSHIFUJI Hideaki + * Split up af-specific portion * */ @@ -84,7 +84,7 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { if (dst->ops->family == AF_INET6) { - struct rt6_info *rt = (struct rt6_info*)dst; + struct rt6_info *rt = (struct rt6_info *)dst; if (rt->rt6i_node) path->path_cookie = rt->rt6i_node->fn_sernum; } @@ -97,7 +97,7 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { - struct rt6_info *rt = (struct rt6_info*)xdst->route; + struct rt6_info *rt = (struct rt6_info *)xdst->route; xdst->u.dst.dev = dev; dev_hold(dev); @@ -296,7 +296,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .family = AF_INET6, .dst_ops = &xfrm6_dst_ops, .dst_lookup = xfrm6_dst_lookup, - .get_saddr = xfrm6_get_saddr, + .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, .init_dst = xfrm6_init_dst, @@ -319,9 +319,9 @@ static void xfrm6_policy_fini(void) static struct ctl_table xfrm6_policy_table[] = { { .procname = "xfrm6_gc_thresh", - .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, - .maxlen = sizeof(int), - .mode = 0644, + .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_dointvec, }, { } diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 3fc970135fc6..8a1f9c0d2a13 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -3,11 +3,11 @@ * * Authors: * Mitsuru KANDA @USAGI - * Kazunori MIYAZAWA @USAGI - * Kunihiro Ishiguro - * IPv6 support - * YOSHIFUJI Hideaki @USAGI - * Split up af-specific portion + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion * */ @@ -45,10 +45,10 @@ xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, const xfrm_address_t *daddr, const xfrm_address_t *saddr) { x->id = tmpl->id; - if (ipv6_addr_any((struct in6_addr*)&x->id.daddr)) + if (ipv6_addr_any((struct in6_addr *)&x->id.daddr)) memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); - if (ipv6_addr_any((struct in6_addr*)&x->props.saddr)) + if (ipv6_addr_any((struct in6_addr *)&x->props.saddr)) memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); x->props.mode = tmpl->mode; x->props.reqid = tmpl->reqid; diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 1c66465a42dd..7b2508be24fd 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -15,7 +15,7 @@ * along with this program; if not, see . * * Authors Mitsuru KANDA - * YOSHIFUJI Hideaki + * YOSHIFUJI Hideaki * * Based on net/ipv4/xfrm4_tunnel.c * -- cgit v1.2.3 From cc24becae3e87d7aa8238f4fcb29bfb68f7ffb97 Mon Sep 17 00:00:00 2001 From: Ian Morris Date: Sun, 24 Aug 2014 21:53:11 +0100 Subject: ipv6: White-space cleansing : Structure layouts This patch makes no changes to the logic of the code but simply addresses coding style issues as detected by checkpatch. Both objdump and diff -w show no differences. This patch addresses structure definitions, specifically it cleanses the brace placement and replaces spaces with tabs in a few places. Signed-off-by: Ian Morris Signed-off-by: David S. Miller --- net/ipv6/ah6.c | 5 ++--- net/ipv6/esp6.c | 7 +++---- net/ipv6/ipcomp6.c | 6 ++---- net/ipv6/mip6.c | 10 ++++------ net/ipv6/reassembly.c | 6 ++---- 5 files changed, 13 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index ac71d9e0a500..fcffd4e522c8 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -755,11 +755,10 @@ static int ah6_rcv_cb(struct sk_buff *skb, int err) return 0; } -static const struct xfrm_type ah6_type = -{ +static const struct xfrm_type ah6_type = { .description = "AH6", .owner = THIS_MODULE, - .proto = IPPROTO_AH, + .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, .init_state = ah6_init_state, .destructor = ah6_destroy, diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 7b6e830ebc6d..83fc3a385a26 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -621,11 +621,10 @@ static int esp6_rcv_cb(struct sk_buff *skb, int err) return 0; } -static const struct xfrm_type esp6_type = -{ +static const struct xfrm_type esp6_type = { .description = "ESP6", - .owner = THIS_MODULE, - .proto = IPPROTO_ESP, + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, .flags = XFRM_TYPE_REPLAY_PROT, .init_state = esp6_init_state, .destructor = esp6_destroy, diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index d1c793cffcb5..1b9316e1386a 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -181,8 +181,7 @@ static int ipcomp6_rcv_cb(struct sk_buff *skb, int err) return 0; } -static const struct xfrm_type ipcomp6_type = -{ +static const struct xfrm_type ipcomp6_type = { .description = "IPCOMP6", .owner = THIS_MODULE, .proto = IPPROTO_COMP, @@ -193,8 +192,7 @@ static const struct xfrm_type ipcomp6_type = .hdr_offset = xfrm6_find_1stfragopt, }; -static struct xfrm6_protocol ipcomp6_protocol = -{ +static struct xfrm6_protocol ipcomp6_protocol = { .handler = xfrm6_rcv, .cb_handler = ipcomp6_rcv_cb, .err_handler = ipcomp6_err, diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index db9b6cbc9db3..f61429d391d3 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -336,11 +336,10 @@ static void mip6_destopt_destroy(struct xfrm_state *x) { } -static const struct xfrm_type mip6_destopt_type = -{ +static const struct xfrm_type mip6_destopt_type = { .description = "MIP6DESTOPT", .owner = THIS_MODULE, - .proto = IPPROTO_DSTOPTS, + .proto = IPPROTO_DSTOPTS, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR, .init_state = mip6_destopt_init_state, .destructor = mip6_destopt_destroy, @@ -469,11 +468,10 @@ static void mip6_rthdr_destroy(struct xfrm_state *x) { } -static const struct xfrm_type mip6_rthdr_type = -{ +static const struct xfrm_type mip6_rthdr_type = { .description = "MIP6RT", .owner = THIS_MODULE, - .proto = IPPROTO_ROUTING, + .proto = IPPROTO_ROUTING, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR, .init_state = mip6_rthdr_init_state, .destructor = mip6_rthdr_destroy, diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index fe156d99179e..1a157ca2ebc1 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -62,8 +62,7 @@ static const char ip6_frag_cache_name[] = "ip6-frags"; -struct ip6frag_skb_cb -{ +struct ip6frag_skb_cb { struct inet6_skb_parm h; int offset; }; @@ -575,8 +574,7 @@ fail_hdr: return -1; } -static const struct inet6_protocol frag_protocol = -{ +static const struct inet6_protocol frag_protocol = { .handler = ipv6_frag_rcv, .flags = INET6_PROTO_NOPOLICY, }; -- cgit v1.2.3 From 4c83acbc565d53296f1731034c5041a0fbabcaeb Mon Sep 17 00:00:00 2001 From: Ian Morris Date: Sun, 24 Aug 2014 21:53:12 +0100 Subject: ipv6: White-space cleansing : gaps between function and symbol export This patch makes no changes to the logic of the code but simply addresses coding style issues as detected by checkpatch. Both objdump and diff -w show no differences. This patch removes some blank lines between the end of a function definition and the EXPORT_SYMBOL_GPL macro in order to prevent checkpatch warning that EXPORT_SYMBOL must immediately follow a function. Signed-off-by: Ian Morris Signed-off-by: David S. Miller --- net/ipv6/inet6_connection_sock.c | 4 ---- net/ipv6/inet6_hashtables.c | 3 --- net/ipv6/ip6_flowlabel.c | 1 - net/ipv6/ip6_output.c | 1 - net/ipv6/ipv6_sockglue.c | 4 ---- net/ipv6/ndisc.c | 1 - net/ipv6/route.c | 2 -- net/ipv6/tunnel6.c | 2 -- net/ipv6/xfrm6_input.c | 2 -- net/ipv6/xfrm6_output.c | 1 - net/ipv6/xfrm6_tunnel.c | 2 -- 11 files changed, 23 deletions(-) (limited to 'net') diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 8c33a0b52b64..29b32206e494 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -63,7 +63,6 @@ int inet6_csk_bind_conflict(const struct sock *sk, return sk2 != NULL; } - EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); struct dst_entry *inet6_csk_route_req(struct sock *sk, @@ -144,7 +143,6 @@ struct request_sock *inet6_csk_search_req(const struct sock *sk, return NULL; } - EXPORT_SYMBOL_GPL(inet6_csk_search_req); void inet6_csk_reqsk_queue_hash_add(struct sock *sk, @@ -160,7 +158,6 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk, reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); inet_csk_reqsk_queue_added(sk, timeout); } - EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) @@ -175,7 +172,6 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, sk->sk_bound_dev_if); } - EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); static inline diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 5f7927c06f18..051dffb49c90 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -222,7 +222,6 @@ begin: rcu_read_unlock(); return result; } - EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, @@ -238,7 +237,6 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, return sk; } - EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, @@ -324,5 +322,4 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk), __inet6_check_established, __inet6_hash); } - EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 2ace4749bef4..3dd7d4ebd7cd 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -259,7 +259,6 @@ struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) rcu_read_unlock_bh(); return NULL; } - EXPORT_SYMBOL_GPL(fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 4ead5547b5c6..b7a3e7b3378e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -233,7 +233,6 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, kfree_skb(skb); return -EMSGSIZE; } - EXPORT_SYMBOL(ip6_xmit); static int ip6_call_ra_chain(struct sk_buff *skb, int sel) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 64177efe5cca..e1a9583bb419 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -873,7 +873,6 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, #endif return err; } - EXPORT_SYMBOL(ipv6_setsockopt); #ifdef CONFIG_COMPAT @@ -909,7 +908,6 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, #endif return err; } - EXPORT_SYMBOL(compat_ipv6_setsockopt); #endif @@ -1321,7 +1319,6 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, #endif return err; } - EXPORT_SYMBOL(ipv6_getsockopt); #ifdef CONFIG_COMPAT @@ -1364,7 +1361,6 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, #endif return err; } - EXPORT_SYMBOL(compat_ipv6_getsockopt); #endif diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 995a82945994..4cb45c1079a2 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -284,7 +284,6 @@ int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, } return -EINVAL; } - EXPORT_SYMBOL(ndisc_mc_map); static u32 ndisc_hash(const void *pkey, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 76c793096893..f74b0417bd60 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -843,7 +843,6 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, return NULL; } - EXPORT_SYMBOL(rt6_lookup); /* ip6_ins_rt is called with FREE table->tb6_lock. @@ -1041,7 +1040,6 @@ struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); } - EXPORT_SYMBOL(ip6_route_output); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 7a53d397c739..3c758007b327 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -64,7 +64,6 @@ err: return ret; } - EXPORT_SYMBOL(xfrm6_tunnel_register); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) @@ -92,7 +91,6 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) return ret; } - EXPORT_SYMBOL(xfrm6_tunnel_deregister); #define for_each_tunnel_rcu(head, handler) \ diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index cd4c98c7c805..f48fbe4d16f5 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -52,7 +52,6 @@ int xfrm6_rcv(struct sk_buff *skb) return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0); } - EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, @@ -142,5 +141,4 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, drop: return -1; } - EXPORT_SYMBOL(xfrm6_input_addr); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 433672d07d0b..ca3f29b98ae5 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -25,7 +25,6 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, { return ip6_find_1stfragopt(skb, prevhdr); } - EXPORT_SYMBOL(xfrm6_find_1stfragopt); static int xfrm6_local_dontfrag(struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 7b2508be24fd..5743044cd660 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -110,7 +110,6 @@ __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) rcu_read_unlock_bh(); return htonl(spi); } - EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi) @@ -187,7 +186,6 @@ __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) return htonl(spi); } - EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi); static void x6spi_destroy_rcu(struct rcu_head *head) -- cgit v1.2.3 From 4798248e4e023170e937a65a1d30fcc52496dd42 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 22 Aug 2014 16:21:53 -0700 Subject: net: Add ops->ndo_xmit_flush() Signed-off-by: David S. Miller --- drivers/net/wan/dlci.c | 2 +- drivers/usb/gadget/function/f_ncm.c | 2 +- include/linux/netdevice.h | 35 +++++++++++++++++++++++++++++++++++ net/atm/mpc.c | 2 +- net/core/dev.c | 5 ++--- net/core/netpoll.c | 3 +-- net/core/pktgen.c | 4 +--- net/packet/af_packet.c | 3 +-- net/sched/sch_teql.c | 3 +-- 9 files changed, 44 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 43c9960dce1c..81b22a180aad 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -193,7 +193,7 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) struct dlci_local *dlp = netdev_priv(dev); if (skb) - dlp->slave->netdev_ops->ndo_start_xmit(skb, dlp->slave); + netdev_start_xmit(skb, dlp->slave); return NETDEV_TX_OK; } diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index bcdc882cd415..cb5d646db6a7 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -1101,7 +1101,7 @@ static void ncm_tx_tasklet(unsigned long data) /* Only send if data is available. */ if (ncm->skb_tx_data) { ncm->timer_force_tx = true; - ncm->netdev->netdev_ops->ndo_start_xmit(NULL, ncm->netdev); + netdev_start_xmit(NULL, ncm->netdev); ncm->timer_force_tx = false; } } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eb73444e1bd0..220c50984688 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -782,6 +782,19 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX) * Required can not be NULL. * + * void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); + * A driver implements this function when it wishes to support + * deferred TX queue flushing. The idea is that the expensive + * operation to trigger TX queue processing can be done after + * N calls to ndo_start_xmit rather than being done every single + * time. In this regime ndo_start_xmit will be called one or more + * times, and then a final ndo_xmit_flush call will be made to + * have the driver tell the device about the new pending TX queue + * entries. The kernel keeps track of which queues need flushing + * by monitoring skb->queue_mapping of the packets it submits to + * ndo_start_xmit. This is the queue value that will be passed + * to ndo_xmit_flush. + * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * void *accel_priv, select_queue_fallback_t fallback); * Called to decide which queue to when device supports multiple @@ -1005,6 +1018,7 @@ struct net_device_ops { int (*ndo_stop)(struct net_device *dev); netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb, struct net_device *dev); + void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, void *accel_priv, @@ -3430,6 +3444,27 @@ int __init dev_proc_init(void); #define dev_proc_init() 0 #endif +static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, + struct sk_buff *skb, struct net_device *dev) +{ + netdev_tx_t ret; + u16 q; + + q = skb->queue_mapping; + ret = ops->ndo_start_xmit(skb, dev); + if (dev_xmit_complete(ret) && ops->ndo_xmit_flush) + ops->ndo_xmit_flush(dev, q); + + return ret; +} + +static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + return __netdev_start_xmit(ops, skb, dev); +} + int netdev_class_create_file_ns(struct class_attribute *class_attr, const void *ns); void netdev_class_remove_file_ns(struct class_attribute *class_attr, diff --git a/net/atm/mpc.c b/net/atm/mpc.c index e8e0e7a8a23d..d662da161e5a 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -599,7 +599,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb, } non_ip: - return mpc->old_ops->ndo_start_xmit(skb, dev); + return __netdev_start_xmit(mpc->old_ops, skb, dev); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) diff --git a/net/core/dev.c b/net/core/dev.c index b6a718ec11c1..26d296c2447c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2602,7 +2602,6 @@ EXPORT_SYMBOL(netif_skb_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { - const struct net_device_ops *ops = dev->netdev_ops; int rc = NETDEV_TX_OK; unsigned int skb_len; @@ -2667,7 +2666,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = ops->ndo_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev); trace_net_dev_xmit(skb, rc, dev, skb_len); if (rc == NETDEV_TX_OK) txq_trans_update(txq); @@ -2686,7 +2685,7 @@ gso: skb_len = nskb->len; trace_net_dev_start_xmit(nskb, dev); - rc = ops->ndo_start_xmit(nskb, dev); + rc = netdev_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 907fb5e36c02..a5ad06828d67 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -72,7 +72,6 @@ module_param(carrier_timeout, uint, 0644); static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { - const struct net_device_ops *ops = dev->netdev_ops; int status = NETDEV_TX_OK; netdev_features_t features; @@ -92,7 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = ops->ndo_start_xmit(skb, dev); + status = netdev_start_xmit(skb, dev); if (status == NETDEV_TX_OK) txq_trans_update(txq); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8b849ddfef2e..83e2b4b19eb7 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3285,8 +3285,6 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev *pkt_dev) { struct net_device *odev = pkt_dev->odev; - netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *) - = odev->netdev_ops->ndo_start_xmit; struct netdev_queue *txq; u16 queue_map; int ret; @@ -3339,7 +3337,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = (*xmit)(pkt_dev->skb, odev); + ret = netdev_start_xmit(pkt_dev->skb, odev); switch (ret) { case NETDEV_TX_OK: diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 93896d2092f6..0dfa990d4eaa 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -240,7 +240,6 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po); static int packet_direct_xmit(struct sk_buff *skb) { struct net_device *dev = skb->dev; - const struct net_device_ops *ops = dev->netdev_ops; netdev_features_t features; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; @@ -262,7 +261,7 @@ static int packet_direct_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) { - ret = ops->ndo_start_xmit(skb, dev); + ret = netdev_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) txq_trans_update(txq); } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index bd33793b527e..64cd93ca8104 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -301,7 +301,6 @@ restart: do { struct net_device *slave = qdisc_dev(q); struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0); - const struct net_device_ops *slave_ops = slave->netdev_ops; if (slave_txq->qdisc_sleeping != q) continue; @@ -317,7 +316,7 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) { + netdev_start_xmit(skb, slave) == NETDEV_TX_OK) { txq_trans_update(slave_txq); __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); -- cgit v1.2.3 From a796dac9a6bedff6db99f57828c85c97071d3d1e Mon Sep 17 00:00:00 2001 From: Tomasz Bursztyka Date: Wed, 13 Aug 2014 16:04:51 +0300 Subject: wireless: core: Reorder wiphy_register() notifications relevantly Currently it can send regulatory domain change notification before any NEW_WIPHY notification. Moreover, if rfill_register() fails, calling wiphy_unregister() will send a DEL_WIPHY though no NEW_WIPHY had been sent previously. Thus reordering so it properly notifies NEW_WIPHY before any other. Signed-off-by: Tomasz Bursztyka Signed-off-by: John W. Linville --- net/wireless/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index afee5e0455ea..682babde4aa5 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -635,6 +635,9 @@ int wiphy_register(struct wiphy *wiphy) if (IS_ERR(rdev->wiphy.debugfsdir)) rdev->wiphy.debugfsdir = NULL; + cfg80211_debugfs_rdev_add(rdev); + nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); + if (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { struct regulatory_request request; @@ -646,8 +649,6 @@ int wiphy_register(struct wiphy *wiphy) nl80211_send_reg_change_event(&request); } - cfg80211_debugfs_rdev_add(rdev); - rdev->wiphy.registered = true; rtnl_unlock(); @@ -659,8 +660,6 @@ int wiphy_register(struct wiphy *wiphy) return res; } - nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); - return 0; } EXPORT_SYMBOL(wiphy_register); -- cgit v1.2.3 From 6451b3f59ab39162d1fbb5a5d0c8f46c0d9e1231 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 25 Aug 2014 17:03:46 -0700 Subject: net: fix comments for __skb_flow_get_ports() Fixes: commit 690e36e726d00d2 (net: Allow raw buffers to be passed into the flow dissector) Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 8ffcc97871c8..ae8f0db67aed 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -26,10 +26,12 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i } /** - * skb_flow_get_ports - extract the upper layer ports and return them - * @skb: buffer to extract the ports from + * __skb_flow_get_ports - extract the upper layer ports and return them + * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset -- cgit v1.2.3 From 453a940ea725d692282f9e66475cec0d1b1e12f2 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 25 Aug 2014 17:03:47 -0700 Subject: net: make skb an optional parameter for__skb_flow_dissect() Fixes: commit 690e36e726d00d2 (net: Allow raw buffers to be passed into the flow dissector) Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/flow_keys.h | 4 ++-- net/core/flow_dissector.c | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index 4040f63932c5..9a03f73c4974 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -28,10 +28,10 @@ struct flow_keys { }; bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, - void *data, int hlen); + void *data, __be16 proto, int nhoff, int hlen); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) { - return __skb_flow_dissect(skb, flow, NULL, 0); + return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0); } __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, void *data, int hlen_proto); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ae8f0db67aed..12f48ca7a0b0 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -59,14 +59,26 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); -bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, void *data, int hlen) +/** + * __skb_flow_dissect - extract the flow_keys struct and return it + * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol + * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * + * The function will try to retrieve the struct flow_keys from either the skbuff + * or a raw buffer specified by the rest parameters + */ +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + void *data, __be16 proto, int nhoff, int hlen) { - int nhoff = skb_network_offset(skb); u8 ip_proto; - __be16 proto = skb->protocol; if (!data) { data = skb->data; + proto = skb->protocol; + nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); } -- cgit v1.2.3 From f8134fed8346c05007ca23e22e1504097afcd7c9 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Sat, 28 Jun 2014 16:35:26 -0400 Subject: mac80211: mesh_plink: use get_unaligned_le16 instead of memcpy Use get_unaligned_le16 to access llid/plid. Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg --- net/mac80211/mesh_plink.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 8d67c1ebfbda..8f0887fc7128 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -1001,7 +1001,6 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, enum ieee80211_self_protected_actioncode ftype; u32 changed = 0; u8 ie_len = elems->peering_len; - __le16 _plid, _llid; u16 plid, llid = 0; if (!elems->peering) { @@ -1036,13 +1035,10 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, /* Note the lines below are correct, the llid in the frame is the plid * from the point of view of this host. */ - memcpy(&_plid, PLINK_GET_LLID(elems->peering), sizeof(__le16)); - plid = le16_to_cpu(_plid); + plid = get_unaligned_le16(PLINK_GET_LLID(elems->peering)); if (ftype == WLAN_SP_MESH_PEERING_CONFIRM || - (ftype == WLAN_SP_MESH_PEERING_CLOSE && ie_len == 8)) { - memcpy(&_llid, PLINK_GET_PLID(elems->peering), sizeof(__le16)); - llid = le16_to_cpu(_llid); - } + (ftype == WLAN_SP_MESH_PEERING_CLOSE && ie_len == 8)) + llid = get_unaligned_le16(PLINK_GET_PLID(elems->peering)); /* WARNING: Only for sta pointer, is dropped & re-acquired */ rcu_read_lock(); -- cgit v1.2.3 From 649b2a4da5d8a39c4d7be2fd228cf797819656f9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Jul 2014 15:01:59 +0200 Subject: mac80211: make ieee80211_vif_use_reserved_switch static Reorder some code to make ieee80211_vif_use_reserved_switch() static, no other changes. Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 187 ++++++++++++++++++++++----------------------- net/mac80211/ieee80211_i.h | 1 - 2 files changed, 93 insertions(+), 95 deletions(-) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 6d537f03c0ba..792eac6cc7b3 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -637,41 +637,6 @@ out: return ret; } -static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_chanctx_conf *conf; - struct ieee80211_chanctx *ctx; - bool use_reserved_switch = false; - - lockdep_assert_held(&local->chanctx_mtx); - - conf = rcu_dereference_protected(sdata->vif.chanctx_conf, - lockdep_is_held(&local->chanctx_mtx)); - if (!conf) - return; - - ctx = container_of(conf, struct ieee80211_chanctx, conf); - - if (sdata->reserved_chanctx) { - if (sdata->reserved_chanctx->replace_state == - IEEE80211_CHANCTX_REPLACES_OTHER && - ieee80211_chanctx_num_reserved(local, - sdata->reserved_chanctx) > 1) - use_reserved_switch = true; - - ieee80211_vif_unreserve_chanctx(sdata); - } - - ieee80211_assign_vif_chanctx(sdata, NULL); - if (ieee80211_chanctx_refcount(local, ctx) == 0) - ieee80211_free_chanctx(local, ctx); - - /* Unreserving may ready an in-place reservation. */ - if (use_reserved_switch) - ieee80211_vif_use_reserved_switch(local); -} - void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx) { @@ -762,63 +727,6 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RX_CHAINS); } -int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, - const struct cfg80211_chan_def *chandef, - enum ieee80211_chanctx_mode mode) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_chanctx *ctx; - u8 radar_detect_width = 0; - int ret; - - lockdep_assert_held(&local->mtx); - - WARN_ON(sdata->dev && netif_carrier_ok(sdata->dev)); - - mutex_lock(&local->chanctx_mtx); - - ret = cfg80211_chandef_dfs_required(local->hw.wiphy, - chandef, - sdata->wdev.iftype); - if (ret < 0) - goto out; - if (ret > 0) - radar_detect_width = BIT(chandef->width); - - sdata->radar_required = ret; - - ret = ieee80211_check_combinations(sdata, chandef, mode, - radar_detect_width); - if (ret < 0) - goto out; - - __ieee80211_vif_release_channel(sdata); - - ctx = ieee80211_find_chanctx(local, chandef, mode); - if (!ctx) - ctx = ieee80211_new_chanctx(local, chandef, mode); - if (IS_ERR(ctx)) { - ret = PTR_ERR(ctx); - goto out; - } - - sdata->vif.bss_conf.chandef = *chandef; - - ret = ieee80211_assign_vif_chanctx(sdata, ctx); - if (ret) { - /* if assign fails refcount stays the same */ - if (ieee80211_chanctx_refcount(local, ctx) == 0) - ieee80211_free_chanctx(local, ctx); - goto out; - } - - ieee80211_recalc_smps_chanctx(local, ctx); - ieee80211_recalc_radar_chanctx(local, ctx); - out: - mutex_unlock(&local->chanctx_mtx); - return ret; -} - static void __ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata, bool clear) @@ -1267,8 +1175,7 @@ err: return err; } -int -ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) +static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata, *sdata_tmp; struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx; @@ -1520,6 +1427,98 @@ err: return err; } +static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *ctx; + bool use_reserved_switch = false; + + lockdep_assert_held(&local->chanctx_mtx); + + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (!conf) + return; + + ctx = container_of(conf, struct ieee80211_chanctx, conf); + + if (sdata->reserved_chanctx) { + if (sdata->reserved_chanctx->replace_state == + IEEE80211_CHANCTX_REPLACES_OTHER && + ieee80211_chanctx_num_reserved(local, + sdata->reserved_chanctx) > 1) + use_reserved_switch = true; + + ieee80211_vif_unreserve_chanctx(sdata); + } + + ieee80211_assign_vif_chanctx(sdata, NULL); + if (ieee80211_chanctx_refcount(local, ctx) == 0) + ieee80211_free_chanctx(local, ctx); + + /* Unreserving may ready an in-place reservation. */ + if (use_reserved_switch) + ieee80211_vif_use_reserved_switch(local); +} + +int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_chan_def *chandef, + enum ieee80211_chanctx_mode mode) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx *ctx; + u8 radar_detect_width = 0; + int ret; + + lockdep_assert_held(&local->mtx); + + WARN_ON(sdata->dev && netif_carrier_ok(sdata->dev)); + + mutex_lock(&local->chanctx_mtx); + + ret = cfg80211_chandef_dfs_required(local->hw.wiphy, + chandef, + sdata->wdev.iftype); + if (ret < 0) + goto out; + if (ret > 0) + radar_detect_width = BIT(chandef->width); + + sdata->radar_required = ret; + + ret = ieee80211_check_combinations(sdata, chandef, mode, + radar_detect_width); + if (ret < 0) + goto out; + + __ieee80211_vif_release_channel(sdata); + + ctx = ieee80211_find_chanctx(local, chandef, mode); + if (!ctx) + ctx = ieee80211_new_chanctx(local, chandef, mode); + if (IS_ERR(ctx)) { + ret = PTR_ERR(ctx); + goto out; + } + + sdata->vif.bss_conf.chandef = *chandef; + + ret = ieee80211_assign_vif_chanctx(sdata, ctx); + if (ret) { + /* if assign fails refcount stays the same */ + if (ieee80211_chanctx_refcount(local, ctx) == 0) + ieee80211_free_chanctx(local, ctx); + goto out; + } + + ieee80211_recalc_smps_chanctx(local, ctx); + ieee80211_recalc_radar_chanctx(local, ctx); + out: + mutex_unlock(&local->chanctx_mtx); + return ret; +} + int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ef7a089ac546..ffb20e5e6cf3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1869,7 +1869,6 @@ ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata, int __must_check ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata); int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata); -int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local); int __must_check ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From 970fdfa89babb5a6f1a3d345e8cb54d92c1e3a8f Mon Sep 17 00:00:00 2001 From: Vladimir Kondratiev Date: Mon, 11 Aug 2014 03:29:57 -0700 Subject: cfg80211: remove @gfp parameter from cfg80211_rx_mgmt() In the cfg80211_rx_mgmt(), parameter @gfp was used for the memory allocation. But, memory get allocated under spin_lock_bh(), this implies atomic context. So, one can't use GFP_KERNEL, only variants with no __GFP_WAIT. Actually, in all occurrences GFP_ATOMIC is used (wil6210 use GFP_KERNEL by mistake), and it should be this way or warning triggered in the memory allocation code. Remove @gfp parameter as no actual choice exist, and use hard coded GFP_ATOMIC for memory allocation. Signed-off-by: Vladimir Kondratiev Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/wmi.c | 5 ++--- drivers/net/wireless/ath/wil6210/wmi.c | 2 +- drivers/net/wireless/brcm80211/brcmfmac/p2p.c | 6 ++---- drivers/net/wireless/mwifiex/util.c | 2 +- drivers/staging/rtl8723au/core/rtw_mlme_ext.c | 2 +- drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c | 4 ++-- include/net/cfg80211.h | 3 +-- net/mac80211/rx.c | 2 +- net/wireless/mlme.c | 4 ++-- 9 files changed, 13 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c index 94df345d08c2..77fcca1f5bd6 100644 --- a/drivers/net/wireless/ath/ath6kl/wmi.c +++ b/drivers/net/wireless/ath/ath6kl/wmi.c @@ -619,8 +619,7 @@ static int ath6kl_wmi_rx_probe_req_event_rx(struct wmi *wmi, u8 *datap, int len, dlen, freq, vif->probe_req_report); if (vif->probe_req_report || vif->nw_type == AP_NETWORK) - cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0, - GFP_ATOMIC); + cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0); return 0; } @@ -659,7 +658,7 @@ static int ath6kl_wmi_rx_action_event_rx(struct wmi *wmi, u8 *datap, int len, return -EINVAL; } ath6kl_dbg(ATH6KL_DBG_WMI, "rx_action: len=%u freq=%u\n", dlen, freq); - cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0, GFP_ATOMIC); + cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0); return 0; } diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c index 1d1d0afdd2e1..335bc38325db 100644 --- a/drivers/net/wireless/ath/wil6210/wmi.c +++ b/drivers/net/wireless/ath/wil6210/wmi.c @@ -350,7 +350,7 @@ static void wmi_evt_rx_mgmt(struct wil6210_priv *wil, int id, void *d, int len) } } else { cfg80211_rx_mgmt(wil->wdev, freq, signal, - (void *)rx_mgmt_frame, d_len, 0, GFP_KERNEL); + (void *)rx_mgmt_frame, d_len, 0); } } diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c index 057b982ea8b3..1d78a91db594 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c @@ -1431,8 +1431,7 @@ int brcmf_p2p_notify_action_frame_rx(struct brcmf_if *ifp, IEEE80211_BAND_5GHZ); wdev = &ifp->vif->wdev; - cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len, 0, - GFP_ATOMIC); + cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len, 0); kfree(mgmt_frame); return 0; @@ -1896,8 +1895,7 @@ s32 brcmf_p2p_notify_rx_mgmt_p2p_probereq(struct brcmf_if *ifp, IEEE80211_BAND_2GHZ : IEEE80211_BAND_5GHZ); - cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len, 0, - GFP_ATOMIC); + cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len, 0); brcmf_dbg(INFO, "mgmt_frame_len (%d) , e->datalen (%d), chanspec (%04x), freq (%d)\n", mgmt_frame_len, e->datalen, chanspec, freq); diff --git a/drivers/net/wireless/mwifiex/util.c b/drivers/net/wireless/mwifiex/util.c index cee028321a9a..ec79c49de097 100644 --- a/drivers/net/wireless/mwifiex/util.c +++ b/drivers/net/wireless/mwifiex/util.c @@ -172,7 +172,7 @@ mwifiex_process_mgmt_packet(struct mwifiex_private *priv, cfg80211_rx_mgmt(priv->wdev, priv->roc_cfg.chan.center_freq, CAL_RSSI(rx_pd->snr, rx_pd->nf), skb->data, pkt_len, - 0, GFP_ATOMIC); + 0); return 0; } diff --git a/drivers/staging/rtl8723au/core/rtw_mlme_ext.c b/drivers/staging/rtl8723au/core/rtw_mlme_ext.c index c5fdcb89dacd..2a1502f351f8 100644 --- a/drivers/staging/rtl8723au/core/rtw_mlme_ext.c +++ b/drivers/staging/rtl8723au/core/rtw_mlme_ext.c @@ -2128,7 +2128,7 @@ static int on_action_public23a(struct rtw_adapter *padapter, IEEE80211_BAND_5GHZ); if (cfg80211_rx_mgmt(padapter->rtw_wdev, freq, 0, pframe, - skb->len, 0, GFP_ATOMIC)) + skb->len, 0)) return _SUCCESS; return _FAIL; diff --git a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c index 93dc844a10b3..2d6d7d1a5b7d 100644 --- a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c @@ -2379,7 +2379,7 @@ void rtw_cfg80211_indicate_sta_assoc(struct rtw_adapter *padapter, IEEE80211_BAND_5GHZ); cfg80211_rx_mgmt(padapter->rtw_wdev, freq, 0, pmgmt_frame, frame_len, - 0, GFP_ATOMIC); + 0); #endif /* defined(RTW_USE_CFG80211_STA_EVENT) */ } @@ -2425,7 +2425,7 @@ void rtw_cfg80211_indicate_sta_disassoc(struct rtw_adapter *padapter, frame_len = sizeof(struct ieee80211_hdr_3addr) + 2; cfg80211_rx_mgmt(padapter->rtw_wdev, freq, 0, (u8 *)&mgmt, frame_len, - 0, GFP_ATOMIC); + 0); #endif /* defined(RTW_USE_CFG80211_STA_EVENT) */ } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0a080c4de275..7b8dac3efe8f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4412,7 +4412,6 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * @buf: Management frame (header + body) * @len: length of the frame data * @flags: flags, as defined in enum nl80211_rxmgmt_flags - * @gfp: context flags * * This function is called whenever an Action frame is received for a station * mode interface, but is not processed in kernel. @@ -4423,7 +4422,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * driver is responsible for rejecting the frame. */ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags, gfp_t gfp); + const u8 *buf, size_t len, u32 flags); /** * cfg80211_mgmt_tx_status - notification of TX status for management frame diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index bd2c9b22c945..a8d862f9183c 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2725,7 +2725,7 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) sig = status->signal; if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, - rx->skb->data, rx->skb->len, 0, GFP_ATOMIC)) { + rx->skb->data, rx->skb->len, 0)) { if (rx->sta) rx->sta->rx_packets++; dev_kfree_skb(rx->skb); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 266766b8d80b..369fc334fdad 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -605,7 +605,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, } bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, - const u8 *buf, size_t len, u32 flags, gfp_t gfp) + const u8 *buf, size_t len, u32 flags) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -648,7 +648,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, /* Indicate the received Action frame to user space */ if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, freq, sig_mbm, - buf, len, flags, gfp)) + buf, len, flags, GFP_ATOMIC)) continue; result = true; -- cgit v1.2.3 From ca34e3b5c808385b175650605faa29e71e91991b Mon Sep 17 00:00:00 2001 From: Ido Yariv Date: Tue, 29 Jul 2014 15:38:53 +0300 Subject: mac80211: Fix accounting of the tailroom-needed counter When hw acceleration is enabled, the GENERATE_IV or PUT_IV_SPACE flags will only require headroom space. Consequently, the tailroom-needed counter can safely be decremented. Signed-off-by: Ido Yariv Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 7 +++++-- net/mac80211/key.c | 12 +++--------- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 1cd84444665c..1fbed0a6d556 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1226,7 +1226,8 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); * * @IEEE80211_KEY_FLAG_GENERATE_IV: This flag should be set by the * driver to indicate that it requires IV generation for this - * particular key. + * particular key. Setting this flag does not necessarily mean that SKBs + * will have sufficient tailroom for ICV or MIC. * @IEEE80211_KEY_FLAG_GENERATE_MMIC: This flag should be set by * the driver for a TKIP key if it requires Michael MIC * generation in software. @@ -1238,7 +1239,9 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); * @IEEE80211_KEY_FLAG_PUT_IV_SPACE: This flag should be set by the driver * if space should be prepared for the IV, but the IV * itself should not be generated. Do not set together with - * @IEEE80211_KEY_FLAG_GENERATE_IV on the same key. + * @IEEE80211_KEY_FLAG_GENERATE_IV on the same key. Setting this flag does + * not necessarily mean that SKBs will have sufficient tailroom for ICV or + * MIC. * @IEEE80211_KEY_FLAG_RX_MGMT: This key will be used to decrypt received * management frames. The flag can help drivers that have a hardware * crypto implementation that doesn't deal with management frames diff --git a/net/mac80211/key.c b/net/mac80211/key.c index d808cff80153..6429d0e1d4a1 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -130,9 +130,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) if (!ret) { key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || - (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) || - (key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))) + if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) sdata->crypto_tx_tailroom_needed_cnt--; WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && @@ -180,9 +178,7 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) sta = key->sta; sdata = key->sdata; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || - (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) || - (key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))) + if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) increment_tailroom_need_count(sdata); ret = drv_set_key(key->local, DISABLE_KEY, sdata, @@ -878,9 +874,7 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf) if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || - (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) || - (key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))) + if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) increment_tailroom_need_count(key->sdata); } -- cgit v1.2.3 From c70f59a2a007c57843195a93c3b7308204e0a5ab Mon Sep 17 00:00:00 2001 From: Ido Yariv Date: Tue, 29 Jul 2014 15:39:14 +0300 Subject: mac80211: don't resize skbs needlessly Header-less cloned skbs with sufficient headroom need not be cloned unless the tailroom is going to be modified. Fix ieee80211_skb_resize so it would only resize cloned skbs if either the header isn't released or the tailroom is going to be modified. Some drivers might have assumed that skbs are never cloned, so add a HW flag that explicitly permits cloned TX skbs. Drivers which do not modify TX skbs should set this flag to avoid copying skbs. Signed-off-by: Ido Yariv Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 5 ++++- net/mac80211/tx.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 1fbed0a6d556..c9b2bec8db47 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1609,6 +1609,9 @@ struct ieee80211_tx_control { * is not enabled the default action is to disconnect when getting the * CSA frame. * + * @IEEE80211_HW_SUPPORTS_CLONED_SKBS: The driver will never modify the payload + * or tailroom of TX skbs without copying them first. + * * @IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS: The HW supports scanning on all bands * in one command, mac80211 doesn't have to run separate scans per band. */ @@ -1642,7 +1645,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_TIMING_BEACON_ONLY = 1<<26, IEEE80211_HW_SUPPORTS_HT_CCK_RATES = 1<<27, IEEE80211_HW_CHANCTX_STA_CSA = 1<<28, - /* bit 29 unused */ + IEEE80211_HW_SUPPORTS_CLONED_SKBS = 1<<29, IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS = 1<<30, }; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2051cc60f8ce..925c39f4099e 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1478,7 +1478,10 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, tail_need = max_t(int, tail_need, 0); } - if (skb_cloned(skb)) + if (skb_cloned(skb) && + (!(local->hw.flags & IEEE80211_HW_SUPPORTS_CLONED_SKBS) || + !skb_clone_writable(skb, ETH_HLEN) || + sdata->crypto_tx_tailroom_needed_cnt)) I802_DEBUG_INC(local->tx_expand_skb_head_cloned); else if (head_need || tail_need) I802_DEBUG_INC(local->tx_expand_skb_head); -- cgit v1.2.3 From f41ef64853fb1e02728e56b2d0d55aef8ed12b26 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Mon, 28 Jul 2014 15:21:05 +0200 Subject: cfg80211: re-enable CSA for drivers that support it This reverts commit dda444d52496aa8ddc501561bca580f1374a96a9. Channel switching code has been reworked and improved significantly since the time original locking issues were found. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/wireless/core.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index afee5e0455ea..a207eb116829 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -492,12 +492,6 @@ int wiphy_register(struct wiphy *wiphy) int i; u16 ifmodes = wiphy->interface_modes; - /* - * There are major locking problems in nl80211/mac80211 for CSA, - * disable for all drivers until this has been reworked. - */ - wiphy->flags &= ~WIPHY_FLAG_HAS_CHANNEL_SWITCH; - #ifdef CONFIG_PM if (WARN_ON(wiphy->wowlan && (wiphy->wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) && -- cgit v1.2.3 From 0e227084aee36b3ba27b4fc9cd9e425be6ce2ab8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 12 Aug 2014 20:34:30 +0200 Subject: cfg80211: clarify BSS probe response vs. beacon data There are a few possible cases of where BSS data came from: 1) only a beacon has been received 2) only a probe response has been received 3) the driver didn't report what it received (this happens when using cfg80211_inform_bss[_width]()) 4) both probe response and beacon data has been received Unfortunately, in the userspace API, a few things weren't there: a) there was no way to differentiate cases 1) and 4) above without comparing the data of the IEs b) the TSF was always from the last frame, instead of being exposed for beacon/probe response separately like IEs Fix this by i) exporting a new flag attribute that indicates whether or not probe response data has been received - this addresses (a) ii) exporting a BEACON_TSF attribute that holds the beacon's TSF if a beacon has been received iii) not exporting the beacon attributes in case (3) above as that would just lead userspace into thinking the data actually came from a beacon when that isn't clear To implement this, track inside the IEs struct whether or not it (definitely) came from a beacon. Reported-by: William Seto Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 16 ++++++++++++++-- net/wireless/nl80211.c | 16 ++++++++++++---- net/wireless/scan.c | 6 ++++-- 4 files changed, 32 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7b8dac3efe8f..77b85a89abca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1503,12 +1503,14 @@ enum cfg80211_signal_type { * @tsf: TSF contained in the frame that carried these IEs * @rcu_head: internal use, for freeing * @len: length of the IEs + * @from_beacon: these IEs are known to come from a beacon * @data: IE data */ struct cfg80211_bss_ies { u64 tsf; struct rcu_head rcu_head; int len; + bool from_beacon; u8 data[]; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f1db15b9c041..d097568da690 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3055,14 +3055,20 @@ enum nl80211_bss_scan_width { * @NL80211_BSS_BSSID: BSSID of the BSS (6 octets) * @NL80211_BSS_FREQUENCY: frequency in MHz (u32) * @NL80211_BSS_TSF: TSF of the received probe response/beacon (u64) + * (if @NL80211_BSS_PRESP_DATA is present then this is known to be + * from a probe response, otherwise it may be from the same beacon + * that the NL80211_BSS_BEACON_TSF will be from) * @NL80211_BSS_BEACON_INTERVAL: beacon interval of the (I)BSS (u16) * @NL80211_BSS_CAPABILITY: capability field (CPU order, u16) * @NL80211_BSS_INFORMATION_ELEMENTS: binary attribute containing the * raw information elements from the probe response/beacon (bin); - * if the %NL80211_BSS_BEACON_IES attribute is present, the IEs here are - * from a Probe Response frame; otherwise they are from a Beacon frame. + * if the %NL80211_BSS_BEACON_IES attribute is present and the data is + * different then the IEs here are from a Probe Response frame; otherwise + * they are from a Beacon frame. * However, if the driver does not indicate the source of the IEs, these * IEs may be from either frame subtype. + * If present, the @NL80211_BSS_PRESP_DATA attribute indicates that the + * data here is known to be from a probe response, without any heuristics. * @NL80211_BSS_SIGNAL_MBM: signal strength of probe response/beacon * in mBm (100 * dBm) (s32) * @NL80211_BSS_SIGNAL_UNSPEC: signal strength of the probe response/beacon @@ -3074,6 +3080,10 @@ enum nl80211_bss_scan_width { * yet been received * @NL80211_BSS_CHAN_WIDTH: channel width of the control channel * (u32, enum nl80211_bss_scan_width) + * @NL80211_BSS_BEACON_TSF: TSF of the last received beacon (u64) + * (not present if no beacon frame has been received yet) + * @NL80211_BSS_PRESP_DATA: the data in @NL80211_BSS_INFORMATION_ELEMENTS and + * @NL80211_BSS_TSF is known to be from a probe response (flag attribute) * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -3091,6 +3101,8 @@ enum nl80211_bss { NL80211_BSS_SEEN_MS_AGO, NL80211_BSS_BEACON_IES, NL80211_BSS_CHAN_WIDTH, + NL80211_BSS_BEACON_TSF, + NL80211_BSS_PRESP_DATA, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index df7b1332a1ec..3011401f52c0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6033,7 +6033,6 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, const struct cfg80211_bss_ies *ies; void *hdr; struct nlattr *bss; - bool tsf = false; ASSERT_WDEV_LOCK(wdev); @@ -6060,18 +6059,27 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; rcu_read_lock(); + /* indicate whether we have probe response data or not */ + if (rcu_access_pointer(res->proberesp_ies) && + nla_put_flag(msg, NL80211_BSS_PRESP_DATA)) + goto fail_unlock_rcu; + + /* this pointer prefers to be pointed to probe response data + * but is always valid + */ ies = rcu_dereference(res->ies); if (ies) { if (nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf)) goto fail_unlock_rcu; - tsf = true; if (ies->len && nla_put(msg, NL80211_BSS_INFORMATION_ELEMENTS, ies->len, ies->data)) goto fail_unlock_rcu; } + + /* and this pointer is always (unless driver didn't know) beacon data */ ies = rcu_dereference(res->beacon_ies); - if (ies) { - if (!tsf && nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf)) + if (ies && ies->from_beacon) { + if (nla_put_u64(msg, NL80211_BSS_BEACON_TSF, ies->tsf)) goto fail_unlock_rcu; if (ies->len && nla_put(msg, NL80211_BSS_BEACON_IES, ies->len, ies->data)) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 0798c62e6085..ad1a1a2808d3 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -918,11 +918,12 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, * override the IEs pointer should we have received an earlier * indication of Probe Response data. */ - ies = kmalloc(sizeof(*ies) + ielen, gfp); + ies = kzalloc(sizeof(*ies) + ielen, gfp); if (!ies) return NULL; ies->len = ielen; ies->tsf = tsf; + ies->from_beacon = false; memcpy(ies->data, ie, ielen); rcu_assign_pointer(tmp.pub.beacon_ies, ies); @@ -982,11 +983,12 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy, if (!channel) return NULL; - ies = kmalloc(sizeof(*ies) + ielen, gfp); + ies = kzalloc(sizeof(*ies) + ielen, gfp); if (!ies) return NULL; ies->len = ielen; ies->tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); + ies->from_beacon = ieee80211_is_beacon(mgmt->frame_control); memcpy(ies->data, mgmt->u.probe_resp.variable, ielen); if (ieee80211_is_probe_resp(mgmt->frame_control)) -- cgit v1.2.3 From 5bc8c1f2b070bab82ed738f98ecfac725e33c57f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 12 Aug 2014 21:01:28 +0200 Subject: cfg80211: allow passing frame type to cfg80211_inform_bss() When using the cfg80211_inform_bss[_width]() functions drivers cannot currently indicate whether the data was received in a beacon or probe response. Fix that by passing a new enum that indicates such (or unknown). For good measure, use it in ath6kl. Acked-by: Kalle Valo [ath6kl] Acked-by: Arend van Spriel [brcmfmac] Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 1 + drivers/net/wireless/ath/ath6kl/wmi.c | 43 +++++----------------- drivers/net/wireless/ath/wil6210/wmi.c | 2 +- .../net/wireless/brcm80211/brcmfmac/wl_cfg80211.c | 18 ++++++--- drivers/net/wireless/libertas/cfg.c | 2 + drivers/net/wireless/mwifiex/cfg80211.c | 1 + drivers/net/wireless/mwifiex/scan.c | 3 +- drivers/net/wireless/orinoco/scan.c | 14 ++++--- drivers/net/wireless/rndis_wlan.c | 14 ++++--- drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c | 1 + drivers/staging/wlan-ng/cfg80211.c | 1 + include/net/cfg80211.h | 20 +++++++++- net/wireless/scan.c | 15 +++++++- 13 files changed, 77 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index e535807c3d89..ba60e37213eb 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -717,6 +717,7 @@ ath6kl_add_bss_if_needed(struct ath6kl_vif *vif, memcpy(ie + 2, vif->ssid, vif->ssid_len); memcpy(ie + 2 + vif->ssid_len, beacon_ie, beacon_ie_len); bss = cfg80211_inform_bss(ar->wiphy, chan, + CFG80211_BSS_FTYPE_UNKNOWN, bssid, 0, cap_val, 100, ie, 2 + vif->ssid_len + beacon_ie_len, 0, GFP_KERNEL); diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c index 77fcca1f5bd6..b921005ad7ee 100644 --- a/drivers/net/wireless/ath/ath6kl/wmi.c +++ b/drivers/net/wireless/ath/ath6kl/wmi.c @@ -1092,7 +1092,6 @@ static int ath6kl_wmi_bssinfo_event_rx(struct wmi *wmi, u8 *datap, int len, u8 *buf; struct ieee80211_channel *channel; struct ath6kl *ar = wmi->parent_dev; - struct ieee80211_mgmt *mgmt; struct cfg80211_bss *bss; if (len <= sizeof(struct wmi_bss_info_hdr2)) @@ -1138,39 +1137,15 @@ static int ath6kl_wmi_bssinfo_event_rx(struct wmi *wmi, u8 *datap, int len, } } - /* - * In theory, use of cfg80211_inform_bss() would be more natural here - * since we do not have the full frame. However, at least for now, - * cfg80211 can only distinguish Beacon and Probe Response frames from - * each other when using cfg80211_inform_bss_frame(), so let's build a - * fake IEEE 802.11 header to be able to take benefit of this. - */ - mgmt = kmalloc(24 + len, GFP_ATOMIC); - if (mgmt == NULL) - return -EINVAL; - - if (bih->frame_type == BEACON_FTYPE) { - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_BEACON); - memset(mgmt->da, 0xff, ETH_ALEN); - } else { - struct net_device *dev = vif->ndev; - - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_PROBE_RESP); - memcpy(mgmt->da, dev->dev_addr, ETH_ALEN); - } - mgmt->duration = cpu_to_le16(0); - memcpy(mgmt->sa, bih->bssid, ETH_ALEN); - memcpy(mgmt->bssid, bih->bssid, ETH_ALEN); - mgmt->seq_ctrl = cpu_to_le16(0); - - memcpy(&mgmt->u.beacon, buf, len); - - bss = cfg80211_inform_bss_frame(ar->wiphy, channel, mgmt, - 24 + len, (bih->snr - 95) * 100, - GFP_ATOMIC); - kfree(mgmt); + bss = cfg80211_inform_bss(ar->wiphy, channel, + bih->frame_type == BEACON_FTYPE ? + CFG80211_BSS_FTYPE_BEACON : + CFG80211_BSS_FTYPE_PRESP, + bih->bssid, get_unaligned_le64((__le64 *)buf), + get_unaligned_le16(((__le16 *)buf) + 5), + get_unaligned_le16(((__le16 *)buf) + 4), + buf + 8 + 2 + 2, len - 8 - 2 - 2, + (bih->snr - 95) * 100, GFP_ATOMIC); if (bss == NULL) return -ENOMEM; cfg80211_put_bss(ar->wiphy, bss); diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c index 335bc38325db..960b66fc1430 100644 --- a/drivers/net/wireless/ath/wil6210/wmi.c +++ b/drivers/net/wireless/ath/wil6210/wmi.c @@ -346,7 +346,7 @@ static void wmi_evt_rx_mgmt(struct wil6210_priv *wil, int id, void *d, int len) rx_mgmt_frame->bssid); cfg80211_put_bss(wiphy, bss); } else { - wil_err(wil, "cfg80211_inform_bss() failed\n"); + wil_err(wil, "cfg80211_inform_bss_frame() failed\n"); } } else { cfg80211_rx_mgmt(wil->wdev, freq, signal, diff --git a/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c b/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c index 02fe706fc9ec..12a60ca1462a 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c @@ -2394,9 +2394,13 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg, brcmf_dbg(CONN, "Beacon interval: %d\n", notify_interval); brcmf_dbg(CONN, "Signal: %d\n", notify_signal); - bss = cfg80211_inform_bss(wiphy, notify_channel, (const u8 *)bi->BSSID, - 0, notify_capability, notify_interval, notify_ie, - notify_ielen, notify_signal, GFP_KERNEL); + bss = cfg80211_inform_bss(wiphy, notify_channel, + CFG80211_BSS_FTYPE_UNKNOWN, + (const u8 *)bi->BSSID, + 0, notify_capability, + notify_interval, notify_ie, + notify_ielen, notify_signal, + GFP_KERNEL); if (!bss) return -ENOMEM; @@ -2498,9 +2502,11 @@ static s32 wl_inform_ibss(struct brcmf_cfg80211_info *cfg, brcmf_dbg(CONN, "beacon interval: %d\n", notify_interval); brcmf_dbg(CONN, "signal: %d\n", notify_signal); - bss = cfg80211_inform_bss(wiphy, notify_channel, bssid, - 0, notify_capability, notify_interval, - notify_ie, notify_ielen, notify_signal, GFP_KERNEL); + bss = cfg80211_inform_bss(wiphy, notify_channel, + CFG80211_BSS_FTYPE_UNKNOWN, bssid, 0, + notify_capability, notify_interval, + notify_ie, notify_ielen, notify_signal, + GFP_KERNEL); if (!bss) { err = -ENOMEM; diff --git a/drivers/net/wireless/libertas/cfg.c b/drivers/net/wireless/libertas/cfg.c index 47a998d8f99e..22884ba7d6cc 100644 --- a/drivers/net/wireless/libertas/cfg.c +++ b/drivers/net/wireless/libertas/cfg.c @@ -653,6 +653,7 @@ static int lbs_ret_scan(struct lbs_private *priv, unsigned long dummy, if (channel && !(channel->flags & IEEE80211_CHAN_DISABLED)) { bss = cfg80211_inform_bss(wiphy, channel, + CFG80211_BSS_FTYPE_UNKNOWN, bssid, get_unaligned_le64(tsfdesc), capa, intvl, ie, ielen, LBS_SCAN_RSSI_TO_MBM(rssi), @@ -1754,6 +1755,7 @@ static void lbs_join_post(struct lbs_private *priv, bss = cfg80211_inform_bss(priv->wdev->wiphy, params->chandef.chan, + CFG80211_BSS_FTYPE_UNKNOWN, bssid, 0, capability, diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c index e2e6bf13c2d8..15f994f3b4ce 100644 --- a/drivers/net/wireless/mwifiex/cfg80211.c +++ b/drivers/net/wireless/mwifiex/cfg80211.c @@ -1557,6 +1557,7 @@ static int mwifiex_cfg80211_inform_ibss_bss(struct mwifiex_private *priv) band)); bss = cfg80211_inform_bss(priv->wdev->wiphy, chan, + CFG80211_BSS_FTYPE_UNKNOWN, bss_info.bssid, 0, WLAN_CAPABILITY_IBSS, 0, ie_buf, ie_len, 0, GFP_KERNEL); cfg80211_put_bss(priv->wdev->wiphy, bss); diff --git a/drivers/net/wireless/mwifiex/scan.c b/drivers/net/wireless/mwifiex/scan.c index dee717a19ddb..195ef0ca343f 100644 --- a/drivers/net/wireless/mwifiex/scan.c +++ b/drivers/net/wireless/mwifiex/scan.c @@ -1719,7 +1719,8 @@ mwifiex_parse_single_response_buf(struct mwifiex_private *priv, u8 **bss_info, if (chan && !(chan->flags & IEEE80211_CHAN_DISABLED)) { bss = cfg80211_inform_bss(priv->wdev->wiphy, - chan, bssid, timestamp, + chan, CFG80211_BSS_FTYPE_UNKNOWN, + bssid, timestamp, cap_info_bitmap, beacon_period, ie_buf, ie_len, rssi, GFP_KERNEL); bss_priv = (struct mwifiex_bss_priv *)bss->priv; diff --git a/drivers/net/wireless/orinoco/scan.c b/drivers/net/wireless/orinoco/scan.c index e175b9b8561b..2c66166add70 100644 --- a/drivers/net/wireless/orinoco/scan.c +++ b/drivers/net/wireless/orinoco/scan.c @@ -123,9 +123,10 @@ static void orinoco_add_hostscan_result(struct orinoco_private *priv, beacon_interval = le16_to_cpu(bss->a.beacon_interv); signal = SIGNAL_TO_MBM(le16_to_cpu(bss->a.level)); - cbss = cfg80211_inform_bss(wiphy, channel, bss->a.bssid, timestamp, - capability, beacon_interval, ie_buf, ie_len, - signal, GFP_KERNEL); + cbss = cfg80211_inform_bss(wiphy, channel, CFG80211_BSS_FTYPE_UNKNOWN, + bss->a.bssid, timestamp, capability, + beacon_interval, ie_buf, ie_len, signal, + GFP_KERNEL); cfg80211_put_bss(wiphy, cbss); } @@ -156,9 +157,10 @@ void orinoco_add_extscan_result(struct orinoco_private *priv, ie = bss->data; signal = SIGNAL_TO_MBM(bss->level); - cbss = cfg80211_inform_bss(wiphy, channel, bss->bssid, timestamp, - capability, beacon_interval, ie, ie_len, - signal, GFP_KERNEL); + cbss = cfg80211_inform_bss(wiphy, channel, CFG80211_BSS_FTYPE_UNKNOWN, + bss->bssid, timestamp, capability, + beacon_interval, ie, ie_len, signal, + GFP_KERNEL); cfg80211_put_bss(wiphy, cbss); } diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index d2a9a08210be..1a4facd1fbf3 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -2022,9 +2022,10 @@ static bool rndis_bss_info_update(struct usbnet *usbdev, capability = le16_to_cpu(fixed->capabilities); beacon_interval = le16_to_cpu(fixed->beacon_interval); - bss = cfg80211_inform_bss(priv->wdev.wiphy, channel, bssid->mac, - timestamp, capability, beacon_interval, ie, ie_len, signal, - GFP_KERNEL); + bss = cfg80211_inform_bss(priv->wdev.wiphy, channel, + CFG80211_BSS_FTYPE_UNKNOWN, bssid->mac, + timestamp, capability, beacon_interval, + ie, ie_len, signal, GFP_KERNEL); cfg80211_put_bss(priv->wdev.wiphy, bss); return (bss != NULL); @@ -2711,9 +2712,10 @@ static void rndis_wlan_craft_connected_bss(struct usbnet *usbdev, u8 *bssid, bssid, (u32)timestamp, capability, beacon_period, ie_len, ssid.essid, signal); - bss = cfg80211_inform_bss(priv->wdev.wiphy, channel, bssid, - timestamp, capability, beacon_period, ie_buf, ie_len, - signal, GFP_KERNEL); + bss = cfg80211_inform_bss(priv->wdev.wiphy, channel, + CFG80211_BSS_FTYPE_UNKNOWN, bssid, + timestamp, capability, beacon_period, + ie_buf, ie_len, signal, GFP_KERNEL); cfg80211_put_bss(priv->wdev.wiphy, bss); } diff --git a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c index 2d6d7d1a5b7d..8b0ccb5c5fc4 100644 --- a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c @@ -279,6 +279,7 @@ static int rtw_cfg80211_inform_bss(struct rtw_adapter *padapter, } bss = cfg80211_inform_bss(wiphy, notify_channel, + CFG80211_BSS_FTYPE_UNKNOWN, pnetwork->network.MacAddress, pnetwork->network.tsf, pnetwork->network.capability, diff --git a/drivers/staging/wlan-ng/cfg80211.c b/drivers/staging/wlan-ng/cfg80211.c index 3727f6d25cf1..8942dcb44180 100644 --- a/drivers/staging/wlan-ng/cfg80211.c +++ b/drivers/staging/wlan-ng/cfg80211.c @@ -422,6 +422,7 @@ static int prism2_scan(struct wiphy *wiphy, IEEE80211_BAND_2GHZ); bss = cfg80211_inform_bss(wiphy, ieee80211_get_channel(wiphy, freq), + CFG80211_BSS_FTYPE_UNKNOWN, (const u8 *) &(msg2.bssid.data.data), msg2.timestamp.data, msg2.capinfo.data, msg2.beaconperiod.data, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 77b85a89abca..ab21299c8f4d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3767,11 +3767,25 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, } /** - * cfg80211_inform_bss - inform cfg80211 of a new BSS + * enum cfg80211_bss_frame_type - frame type that the BSS data came from + * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is + * from a beacon or probe response + * @CFG80211_BSS_FTYPE_BEACON: data comes from a beacon + * @CFG80211_BSS_FTYPE_PRESP: data comes from a probe response + */ +enum cfg80211_bss_frame_type { + CFG80211_BSS_FTYPE_UNKNOWN, + CFG80211_BSS_FTYPE_BEACON, + CFG80211_BSS_FTYPE_PRESP, +}; + +/** + * cfg80211_inform_bss_width - inform cfg80211 of a new BSS * * @wiphy: the wiphy reporting the BSS * @rx_channel: The channel the frame was received on * @scan_width: width of the control channel + * @ftype: frame type (if known) * @bssid: the BSSID of the BSS * @tsf: the TSF sent by the peer in the beacon/probe response (or 0) * @capability: the capability field sent by the peer @@ -3791,6 +3805,7 @@ struct cfg80211_bss * __must_check cfg80211_inform_bss_width(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, enum nl80211_bss_scan_width scan_width, + enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, s32 signal, gfp_t gfp); @@ -3798,12 +3813,13 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, static inline struct cfg80211_bss * __must_check cfg80211_inform_bss(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, + enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, s32 signal, gfp_t gfp) { return cfg80211_inform_bss_width(wiphy, rx_channel, - NL80211_BSS_CHAN_WIDTH_20, + NL80211_BSS_CHAN_WIDTH_20, ftype, bssid, tsf, capability, beacon_interval, ie, ielen, signal, gfp); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ad1a1a2808d3..620a4b40d466 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -884,6 +884,7 @@ struct cfg80211_bss* cfg80211_inform_bss_width(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, enum nl80211_bss_scan_width scan_width, + enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, s32 signal, gfp_t gfp) @@ -911,7 +912,7 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, tmp.pub.beacon_interval = beacon_interval; tmp.pub.capability = capability; /* - * Since we do not know here whether the IEs are from a Beacon or Probe + * If we do not know here whether the IEs are from a Beacon or Probe * Response frame, we need to pick one of the options and only use it * with the driver that does not provide the full Beacon/Probe Response * frame. Use Beacon frame pointer to avoid indicating that this should @@ -926,7 +927,17 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, ies->from_beacon = false; memcpy(ies->data, ie, ielen); - rcu_assign_pointer(tmp.pub.beacon_ies, ies); + switch (ftype) { + case CFG80211_BSS_FTYPE_BEACON: + ies->from_beacon = true; + /* fall through to assign */ + case CFG80211_BSS_FTYPE_UNKNOWN: + rcu_assign_pointer(tmp.pub.beacon_ies, ies); + break; + case CFG80211_BSS_FTYPE_PRESP: + rcu_assign_pointer(tmp.pub.proberesp_ies, ies); + break; + } rcu_assign_pointer(tmp.pub.ies, ies); signal_valid = abs(rx_channel->center_freq - channel->center_freq) <= -- cgit v1.2.3 From ad053a962f1e83597ec8388716619c7633a00ef1 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Fri, 22 Aug 2014 16:14:49 +0300 Subject: mac80211: scan: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index a0a938145dcc..a9bb6eb8c3e0 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -1094,7 +1094,7 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata) if (rcu_access_pointer(local->sched_scan_sdata)) { ret = drv_sched_scan_stop(local, sdata); if (!ret) - rcu_assign_pointer(local->sched_scan_sdata, NULL); + RCU_INIT_POINTER(local->sched_scan_sdata, NULL); } out: mutex_unlock(&local->mtx); -- cgit v1.2.3 From f111f780ae1abf4cdc464f24293be90c010a04f6 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Wed, 20 Aug 2014 22:03:18 +0400 Subject: netfilter: nfnetlink_acct: add filter support to nfacct counter list/reset You can use this to skip accounting objects when listing/resetting via NFNL_MSG_ACCT_GET/NFNL_MSG_ACCT_GET_CTRZERO messages with the NLM_F_DUMP netlink flag. The filtering covers the following cases: 1. No filter specified. In this case, the client will get old behaviour, 2. List/reset counter object only: In this case, you have to use NFACCT_F_QUOTA as mask and value 0. 3. List/reset quota objects only: You have to use NFACCT_F_QUOTA_PKTS as mask and value - the same, for byte based quota mask should be NFACCT_F_QUOTA_BYTES and value - the same. If you want to obtain the object with any quota type (ie. NFACCT_F_QUOTA_PKTS|NFACCT_F_QUOTA_BYTES), you need to perform two dump requests, one to obtain NFACCT_F_QUOTA_PKTS objects and another for NFACCT_F_QUOTA_BYTES. Signed-off-by: Alexey Perevalov Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_acct.h | 8 ++++ net/netfilter/nfnetlink_acct.c | 54 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nfnetlink_acct.h b/include/uapi/linux/netfilter/nfnetlink_acct.h index 51404ec19022..f3e34dbbf966 100644 --- a/include/uapi/linux/netfilter/nfnetlink_acct.h +++ b/include/uapi/linux/netfilter/nfnetlink_acct.h @@ -28,9 +28,17 @@ enum nfnl_acct_type { NFACCT_USE, NFACCT_FLAGS, NFACCT_QUOTA, + NFACCT_FILTER, __NFACCT_MAX }; #define NFACCT_MAX (__NFACCT_MAX - 1) +enum nfnl_attr_filter_type { + NFACCT_FILTER_UNSPEC, + NFACCT_FILTER_MASK, + NFACCT_FILTER_VALUE, + __NFACCT_FILTER_MAX +}; +#define NFACCT_FILTER_MAX (__NFACCT_FILTER_MAX - 1) #endif /* _UAPI_NFNL_ACCT_H_ */ diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 3ea0eacbd970..c18af2f63eef 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -40,6 +40,11 @@ struct nf_acct { char data[0]; }; +struct nfacct_filter { + u32 value; + u32 mask; +}; + #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) #define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ @@ -181,6 +186,7 @@ static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nf_acct *cur, *last; + const struct nfacct_filter *filter = cb->data; if (cb->args[2]) return 0; @@ -197,6 +203,10 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) last = NULL; } + + if (filter && (cur->flags & filter->mask) != filter->value) + continue; + if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), @@ -211,6 +221,38 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int nfnl_acct_done(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + +static const struct nla_policy filter_policy[NFACCT_FILTER_MAX + 1] = { + [NFACCT_FILTER_MASK] = { .type = NLA_U32 }, + [NFACCT_FILTER_VALUE] = { .type = NLA_U32 }, +}; + +static struct nfacct_filter * +nfacct_filter_alloc(const struct nlattr * const attr) +{ + struct nfacct_filter *filter; + struct nlattr *tb[NFACCT_FILTER_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy); + if (err < 0) + return ERR_PTR(err); + + filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); + + filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK])); + filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE])); + + return filter; +} + static int nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) @@ -222,7 +264,18 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_acct_dump, + .done = nfnl_acct_done, }; + + if (tb[NFACCT_FILTER]) { + struct nfacct_filter *filter; + + filter = nfacct_filter_alloc(tb[NFACCT_FILTER]); + if (IS_ERR(filter)) + return PTR_ERR(filter); + + c.data = filter; + } return netlink_dump_start(nfnl, skb, nlh, &c); } @@ -314,6 +367,7 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { [NFACCT_PKTS] = { .type = NLA_U64 }, [NFACCT_FLAGS] = { .type = NLA_U32 }, [NFACCT_QUOTA] = { .type = NLA_U64 }, + [NFACCT_FILTER] = {.type = NLA_NESTED }, }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { -- cgit v1.2.3 From 2688eba9d5ba2a35ac8e5efa2e228f105f0a379f Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Sun, 17 Aug 2014 16:18:02 +0300 Subject: mac80211: Replace rcu_dereference() with rcu_access_pointer() The "rcu_dereference()" calls are used directly in conditions. Since their return values are never dereferenced it is recommended to use "rcu_access_pointer()" instead of "rcu_dereference()". Therefore, this patch makes the replacements. The following Coccinelle semantic patch was used: @@ @@ ( if( (<+... - rcu_dereference + rcu_access_pointer (...) ...+>)) {...} | while( (<+... - rcu_dereference + rcu_access_pointer (...) ...+>)) {...} ) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Johannes Berg --- net/mac80211/mesh_pathtbl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index cf032a8db9d7..a6699dceae7c 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -729,7 +729,7 @@ void mesh_plink_broken(struct sta_info *sta) tbl = rcu_dereference(mesh_paths); for_each_mesh_entry(tbl, node, i) { mpath = node->mpath; - if (rcu_dereference(mpath->next_hop) == sta && + if (rcu_access_pointer(mpath->next_hop) == sta && mpath->flags & MESH_PATH_ACTIVE && !(mpath->flags & MESH_PATH_FIXED)) { spin_lock_bh(&mpath->state_lock); @@ -794,7 +794,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta) tbl = resize_dereference_mesh_paths(); for_each_mesh_entry(tbl, node, i) { mpath = node->mpath; - if (rcu_dereference(mpath->next_hop) == sta) { + if (rcu_access_pointer(mpath->next_hop) == sta) { spin_lock(&tbl->hashwlock[i]); __mesh_path_del(tbl, node); spin_unlock(&tbl->hashwlock[i]); -- cgit v1.2.3 From 253ff51635ad6690276ef065d59523c4bd1cd584 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Aug 2014 12:55:53 +0200 Subject: tcp: syncookies: mark cookie_secret read_mostly only written once. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 2 +- net/ipv6/syncookies.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c0c75688896e..0431a8f3c8f4 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -25,7 +25,7 @@ extern int sysctl_tcp_syncookies; -static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; +static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 83cea1d39466..c643dc907ce7 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -24,7 +24,7 @@ #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS]; +static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; /* RFC 2460, Section 8.3: * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] -- cgit v1.2.3 From 3e8a72d1dae374cf6fc1dba97cec663585845ff9 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:46 -0700 Subject: net: dsa: reduce number of protocol hooks DSA is currently registering one packet_type function per EtherType it needs to intercept in the receive path of a DSA-enabled Ethernet device. Right now we have three of them: trailer, DSA and eDSA, and there might be more in the future, this will not scale to the addition of new protocols. This patch proceeds with adding a new layer of abstraction and two new functions: dsa_switch_rcv() which will dispatch into the tag-protocol specific receive function implemented by net/dsa/tag_*.c dsa_slave_xmit() which will dispatch into the tag-protocol specific transmit function implemented by net/dsa/tag_*.c When we do create the per-port slave network devices, we iterate over the switch protocol to assign the DSA-specific receive and transmit operations. A new fake ethertype value is used: ETH_P_XDSA to illustrate the fact that this is no longer going to look like ETH_P_DSA or ETH_P_TRAILER like it used to be. This allows us to greatly simplify the check in eth_type_trans() and always override the skb->protocol with ETH_P_XDSA for Ethernet switches tagged protocol, while also reducing the number repetitive slave netdevice_ops assignments. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 28 ++++++++++++--------------- include/net/dsa.h | 20 +++---------------- include/uapi/linux/if_ether.h | 1 + net/dsa/dsa.c | 39 ++++++++++++++++++++----------------- net/dsa/dsa_priv.h | 9 +++------ net/dsa/slave.c | 45 ++++++++++++++----------------------------- net/dsa/tag_dsa.c | 8 ++++---- net/dsa/tag_edsa.c | 8 ++++---- net/dsa/tag_trailer.c | 8 ++++---- net/ethernet/eth.c | 7 ++----- 10 files changed, 68 insertions(+), 105 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 039b23786c22..1875dc71422a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1781,24 +1781,13 @@ void dev_net_set(struct net_device *dev, struct net *net) #endif } -static inline bool netdev_uses_dsa_tags(struct net_device *dev) +static inline bool netdev_uses_dsa(struct net_device *dev) { -#ifdef CONFIG_NET_DSA_TAG_DSA - if (dev->dsa_ptr != NULL) - return dsa_uses_dsa_tags(dev->dsa_ptr); -#endif - - return 0; -} - -static inline bool netdev_uses_trailer_tags(struct net_device *dev) -{ -#ifdef CONFIG_NET_DSA_TAG_TRAILER - if (dev->dsa_ptr != NULL) - return dsa_uses_trailer_tags(dev->dsa_ptr); +#ifdef CONFIG_NET_DSA + return dev->dsa_ptr != NULL; +#else + return false; #endif - - return 0; } /** @@ -1933,6 +1922,13 @@ struct udp_offload { struct offload_callbacks callbacks; }; +struct dsa_device_ops { + netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); + int (*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); +}; + + /* often modified stats are per cpu, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64 rx_packets; diff --git a/include/net/dsa.h b/include/net/dsa.h index 6efce384451e..6e26f1e4d8ce 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -59,6 +59,8 @@ struct dsa_platform_data { struct dsa_chip_data *chip; }; +struct dsa_device_ops; + struct dsa_switch_tree { /* * Configuration data for the platform device that owns @@ -71,6 +73,7 @@ struct dsa_switch_tree { * protocol to use. */ struct net_device *master_netdev; + const struct dsa_device_ops *ops; __be16 tag_protocol; /* @@ -186,21 +189,4 @@ static inline void *ds_to_priv(struct dsa_switch *ds) return (void *)(ds + 1); } -/* - * The original DSA tag format and some other tag formats have no - * ethertype, which means that we need to add a little hack to the - * networking receive path to make sure that received frames get - * the right ->protocol assigned to them when one of those tag - * formats is in use. - */ -static inline bool dsa_uses_dsa_tags(struct dsa_switch_tree *dst) -{ - return !!(dst->tag_protocol == htons(ETH_P_DSA)); -} - -static inline bool dsa_uses_trailer_tags(struct dsa_switch_tree *dst) -{ - return !!(dst->tag_protocol == htons(ETH_P_TRAILER)); -} - #endif diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 0f8210b8e0bc..aa63ed023c2b 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -128,6 +128,7 @@ #define ETH_P_PHONET 0x00F5 /* Nokia Phonet frames */ #define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */ #define ETH_P_CAIF 0x00F7 /* ST-Ericsson CAIF protocol */ +#define ETH_P_XDSA 0x00F8 /* Multiplexed DSA protocol */ /* * This is an Ethernet frame header. diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 0a49632fac47..92e71d2a2ccd 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -608,6 +608,24 @@ static void dsa_shutdown(struct platform_device *pdev) { } +static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + + if (unlikely(dst == NULL)) { + kfree_skb(skb); + return 0; + } + + return dst->ops->rcv(skb, dev, pt, orig_dev); +} + +struct packet_type dsa_pack_type __read_mostly = { + .type = cpu_to_be16(ETH_P_XDSA), + .func = dsa_switch_rcv, +}; + static const struct of_device_id dsa_of_match_table[] = { { .compatible = "marvell,dsa", }, {} @@ -633,30 +651,15 @@ static int __init dsa_init_module(void) if (rc) return rc; -#ifdef CONFIG_NET_DSA_TAG_DSA - dev_add_pack(&dsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - dev_add_pack(&edsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - dev_add_pack(&trailer_packet_type); -#endif + dev_add_pack(&dsa_pack_type); + return 0; } module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { -#ifdef CONFIG_NET_DSA_TAG_TRAILER - dev_remove_pack(&trailer_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - dev_remove_pack(&edsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_DSA - dev_remove_pack(&dsa_packet_type); -#endif + dev_remove_pack(&dsa_pack_type); platform_driver_unregister(&dsa_driver); } module_exit(dsa_cleanup_module); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d4cf5cc747e3..218d75d16f6f 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -45,16 +45,13 @@ struct net_device *dsa_slave_create(struct dsa_switch *ds, int port, char *name); /* tag_dsa.c */ -netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type dsa_packet_type; +extern const struct dsa_device_ops dsa_netdev_ops; /* tag_edsa.c */ -netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type edsa_packet_type; +extern const struct dsa_device_ops edsa_netdev_ops; /* tag_trailer.c */ -netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type trailer_packet_type; +extern const struct dsa_device_ops trailer_netdev_ops; #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 45a1e34c89e0..ad1a913533aa 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -171,6 +171,14 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } +static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch_tree *dst = p->parent->dst; + + return dst->ops->xmit(skb, dev); +} + /* ethtool operations *******************************************************/ static int @@ -293,42 +301,16 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_sset_count = dsa_slave_get_sset_count, }; -#ifdef CONFIG_NET_DSA_TAG_DSA -static const struct net_device_ops dsa_netdev_ops = { +static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, - .ndo_start_xmit = dsa_xmit, + .ndo_start_xmit = dsa_slave_xmit, .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, .ndo_do_ioctl = dsa_slave_ioctl, }; -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA -static const struct net_device_ops edsa_netdev_ops = { - .ndo_init = dsa_slave_init, - .ndo_open = dsa_slave_open, - .ndo_stop = dsa_slave_close, - .ndo_start_xmit = edsa_xmit, - .ndo_change_rx_flags = dsa_slave_change_rx_flags, - .ndo_set_rx_mode = dsa_slave_set_rx_mode, - .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_do_ioctl = dsa_slave_ioctl, -}; -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER -static const struct net_device_ops trailer_netdev_ops = { - .ndo_init = dsa_slave_init, - .ndo_open = dsa_slave_open, - .ndo_stop = dsa_slave_close, - .ndo_start_xmit = trailer_xmit, - .ndo_change_rx_flags = dsa_slave_change_rx_flags, - .ndo_set_rx_mode = dsa_slave_set_rx_mode, - .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_do_ioctl = dsa_slave_ioctl, -}; -#endif /* slave device setup *******************************************************/ struct net_device * @@ -349,21 +331,22 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; eth_hw_addr_inherit(slave_dev, master); slave_dev->tx_queue_len = 0; + slave_dev->netdev_ops = &dsa_slave_netdev_ops; switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA case htons(ETH_P_DSA): - slave_dev->netdev_ops = &dsa_netdev_ops; + ds->dst->ops = &dsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_EDSA case htons(ETH_P_EDSA): - slave_dev->netdev_ops = &edsa_netdev_ops; + ds->dst->ops = &edsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER case htons(ETH_P_TRAILER): - slave_dev->netdev_ops = &trailer_netdev_ops; + ds->dst->ops = &trailer_netdev_ops; break; #endif default: diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index cacce1e22f9c..d7dbc5bda5c0 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -16,7 +16,7 @@ #define DSA_HLEN 4 -netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *dsa_header; @@ -186,7 +186,7 @@ out: return 0; } -struct packet_type dsa_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_DSA), - .func = dsa_rcv, +const struct dsa_device_ops dsa_netdev_ops = { + .xmit = dsa_xmit, + .rcv = dsa_rcv, }; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index e70c43c25e64..6b30abe89183 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -17,7 +17,7 @@ #define DSA_HLEN 4 #define EDSA_HLEN 8 -netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *edsa_header; @@ -205,7 +205,7 @@ out: return 0; } -struct packet_type edsa_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_EDSA), - .func = edsa_rcv, +const struct dsa_device_ops edsa_netdev_ops = { + .xmit = edsa_xmit, + .rcv = edsa_rcv, }; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 94bc260d015d..5fe9444842c5 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -14,7 +14,7 @@ #include #include "dsa_priv.h" -netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct sk_buff *nskb; @@ -114,7 +114,7 @@ out: return 0; } -struct packet_type trailer_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_TRAILER), - .func = trailer_rcv, +const struct dsa_device_ops trailer_netdev_ops = { + .xmit = trailer_xmit, + .rcv = trailer_rcv, }; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index f405e0592407..5cebca134585 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -181,11 +181,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. */ - if (unlikely(netdev_uses_dsa_tags(dev))) - return htons(ETH_P_DSA); - - if (unlikely(netdev_uses_trailer_tags(dev))) - return htons(ETH_P_TRAILER); + if (unlikely(netdev_uses_dsa(dev))) + return htons(ETH_P_XDSA); if (likely(ntohs(eth->h_proto) >= ETH_P_802_3_MIN)) return eth->h_proto; -- cgit v1.2.3 From fa981d9af82e08f316ed25ed43078f995cc4be0a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:49 -0700 Subject: net: dsa: provide a switch device device tree node pointer We might need to fetch additional resources from the device tree node pointer, such as register ranges or other properties. Keep a device_node pointer around for this. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ net/dsa/dsa.c | 1 + 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6e26f1e4d8ce..decc62709acd 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -15,6 +15,7 @@ #include #include #include +#include #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 @@ -26,6 +27,12 @@ struct dsa_chip_data { struct device *mii_bus; int sw_addr; + /* Device tree node pointer for this specific switch chip + * used during switch setup in case additional properties + * and resources needs to be used + */ + struct device_node *of_node; + /* * The names of the switch's ports. Use "cpu" to * designate the switch port that the cpu is connected to, diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 92e71d2a2ccd..a28ef432d016 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -410,6 +410,7 @@ static int dsa_of_probe(struct platform_device *pdev) chip_index++; cd = &pd->chip[chip_index]; + cd->of_node = child; cd->mii_bus = &mdio_bus->dev; sw_addr = of_get_property(child, "reg", NULL); -- cgit v1.2.3 From bd47497a0171b96264927e3377254db13b9fe3e3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:50 -0700 Subject: net: dsa: retain a per-port device_node pointer We will later use the per-port device_node pointer to fetch a bunch of port-specific properties. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/dsa.c | 2 ++ net/dsa/slave.c | 1 + 3 files changed, 4 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index decc62709acd..597875d3f69e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -41,6 +41,7 @@ struct dsa_chip_data { * or any other string to indicate this is a physical port. */ char *port_names[DSA_MAX_PORTS]; + struct device_node *port_dn[DSA_MAX_PORTS]; /* * An array (with nr_chips elements) of which element [a] diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index a28ef432d016..6a5bae673037 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -432,6 +432,8 @@ static int dsa_of_probe(struct platform_device *pdev) if (!port_name) continue; + cd->port_dn[port_index] = port; + cd->port_names[port_index] = kstrdup(port_name, GFP_KERNEL); if (!cd->port_names[port_index]) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ad1a913533aa..5688c34253e5 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -354,6 +354,7 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, } SET_NETDEV_DEV(slave_dev, parent); + slave_dev->dev.of_node = ds->pd->port_dn[port]; slave_dev->vlan_features = master->vlan_features; p = netdev_priv(slave_dev); -- cgit v1.2.3 From 0d8bcdd383b8865e752a7e8edb4712c2e3902052 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:51 -0700 Subject: net: dsa: allow for more complex PHY setups Modify the DSA slave interface to be bound to an arbitray PHY, not just the ones that are available as child PHY devices of the switch MDIO bus. This allows us for instance to have external PHYs connected to a separate MDIO bus, but yet also connected to a given switch port. Under certain configurations, the physical port mask might not be a 1:1 mapping to the MII PHYs mask. This is the case, if e.g: Port 1 of the switch is used and connects to a PHY at a MDIO address different than 1. Introduce a phys_mii_mask variable which allows driver to implement and divert their own MDIO read/writes operations for a subset of the MDIO PHY addresses. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/dsa.c | 5 ++++ net/dsa/dsa_priv.h | 4 +++ net/dsa/slave.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 83 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 597875d3f69e..dc357454ae3b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -130,6 +130,7 @@ struct dsa_switch { */ u32 dsa_port_mask; u32 phys_port_mask; + u32 phys_mii_mask; struct mii_bus *slave_mii_bus; struct net_device *ports[DSA_MAX_PORTS]; }; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6a5bae673037..4dc2a16b72cf 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -144,6 +144,11 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, goto out; } + /* Make the built-in MII bus mask match the number of ports, + * switch drivers can override this later + */ + ds->phys_mii_mask = ds->phys_port_mask; + /* * If the CPU connects to this switch, set the switch tree * tagging protocol to the preferred tagging format of this diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 218d75d16f6f..d20364ac1574 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -33,6 +33,10 @@ struct dsa_slave_priv { * to this port. */ struct phy_device *phy; + phy_interface_t phy_interface; + int old_link; + int old_pause; + int old_duplex; }; /* dsa.c */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5688c34253e5..03d2894a0f8a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include "dsa_priv.h" /* slave mii_bus handling ***************************************************/ @@ -19,7 +21,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { struct dsa_switch *ds = bus->priv; - if (ds->phys_port_mask & (1 << addr)) + if (ds->phys_mii_mask & (1 << addr)) return ds->drv->phy_read(ds, addr, reg); return 0xffff; @@ -29,7 +31,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) { struct dsa_switch *ds = bus->priv; - if (ds->phys_port_mask & (1 << addr)) + if (ds->phys_mii_mask & (1 << addr)) return ds->drv->phy_write(ds, addr, reg, val); return 0; @@ -312,7 +314,70 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_do_ioctl = dsa_slave_ioctl, }; +static void dsa_slave_adjust_link(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + unsigned int status_changed = 0; + + if (p->old_link != p->phy->link) { + status_changed = 1; + p->old_link = p->phy->link; + } + + if (p->old_duplex != p->phy->duplex) { + status_changed = 1; + p->old_duplex = p->phy->duplex; + } + + if (p->old_pause != p->phy->pause) { + status_changed = 1; + p->old_pause = p->phy->pause; + } + + if (status_changed) + phy_print_status(p->phy); +} + /* slave device setup *******************************************************/ +static void dsa_slave_phy_setup(struct dsa_slave_priv *p, + struct net_device *slave_dev) +{ + struct dsa_switch *ds = p->parent; + struct dsa_chip_data *cd = ds->pd; + struct device_node *phy_dn, *port_dn; + int ret; + + port_dn = cd->port_dn[p->port]; + p->phy_interface = of_get_phy_mode(port_dn); + + phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); + if (of_phy_is_fixed_link(port_dn)) { + /* In the case of a fixed PHY, the DT node associated + * to the fixed PHY is the Port DT node + */ + ret = of_phy_register_fixed_link(port_dn); + if (ret) { + pr_err("failed to register fixed PHY\n"); + return; + } + phy_dn = port_dn; + } + + if (phy_dn) + p->phy = of_phy_connect(slave_dev, phy_dn, + dsa_slave_adjust_link, 0, + p->phy_interface); + + /* We could not connect to a designated PHY, so use the switch internal + * MDIO bus instead + */ + if (!p->phy) + p->phy = ds->slave_mii_bus->phy_map[p->port]; + else + pr_info("attached PHY at address %d [%s]\n", + p->phy->addr, p->phy->drv->name); +} + struct net_device * dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name) @@ -361,7 +426,12 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, p->dev = slave_dev; p->parent = ds; p->port = port; - p->phy = ds->slave_mii_bus->phy_map[port]; + + p->old_pause = -1; + p->old_link = -1; + p->old_duplex = -1; + + dsa_slave_phy_setup(p, slave_dev); ret = register_netdev(slave_dev); if (ret) { -- cgit v1.2.3 From 5aed85cec29882d1c4b4b2a01cb75a99efdbe4ed Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:52 -0700 Subject: net: dsa: allow switches to work without tagging In case switch port tagging is disabled (voluntarily, or the switch just does not support it), allow us to continue using the defined set of dsa_device_ops in net/dsa/slave.c. We introduce dsa_protocol_is_tagged() to check whether we need to override skb->protocol and go through the DSA-specifif packet_type function, or if we just go on and receive the SKB through the normal path. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++--- include/net/dsa.h | 5 +++++ net/dsa/slave.c | 19 ++++++++++++++++++- 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1875dc71422a..429801370d0c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1784,10 +1784,10 @@ void dev_net_set(struct net_device *dev, struct net *net) static inline bool netdev_uses_dsa(struct net_device *dev) { #ifdef CONFIG_NET_DSA - return dev->dsa_ptr != NULL; -#else - return false; + if (dev->dsa_ptr != NULL) + return dsa_uses_tagged_protocol(dev->dsa_ptr); #endif + return false; } /** diff --git a/include/net/dsa.h b/include/net/dsa.h index dc357454ae3b..1035f6452d79 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -198,4 +198,9 @@ static inline void *ds_to_priv(struct dsa_switch *ds) return (void *)(ds + 1); } +static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) +{ + return dst->tag_protocol != 0; +} + #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 03d2894a0f8a..241c2a1684cb 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -181,6 +181,17 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) return dst->ops->xmit(skb, dev); } +static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + skb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(skb); + + return NETDEV_TX_OK; +} + /* ethtool operations *******************************************************/ static int @@ -314,6 +325,11 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_do_ioctl = dsa_slave_ioctl, }; +static const struct dsa_device_ops notag_netdev_ops = { + .xmit = dsa_slave_notag_xmit, + .rcv = NULL, +}; + static void dsa_slave_adjust_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -415,7 +431,8 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, break; #endif default: - BUG(); + ds->dst->ops = ¬ag_netdev_ops; + break; } SET_NETDEV_DEV(slave_dev, parent); -- cgit v1.2.3 From ec9436baedb689668c409cfc8b69eb9573b0d661 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:53 -0700 Subject: net: dsa: allow drivers to do link adjustment Whenever libphy determines that the link status of a given PHY/port has changed, allow to call into the switch driver link adjustment callback so proper actions can be taken care of by the switch driver upon link notification. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ net/dsa/slave.c | 4 ++++ 2 files changed, 11 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1035f6452d79..2d3835924dd2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -16,6 +16,7 @@ #include #include #include +#include #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 @@ -181,6 +182,12 @@ struct dsa_switch_driver { */ void (*poll_link)(struct dsa_switch *ds); + /* + * Link state adjustment (called from libphy) + */ + void (*adjust_link)(struct dsa_switch *ds, int port, + struct phy_device *phydev); + /* * ethtool hardware statistics. */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 241c2a1684cb..398d0663d3dd 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -333,6 +333,7 @@ static const struct dsa_device_ops notag_netdev_ops = { static void dsa_slave_adjust_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; unsigned int status_changed = 0; if (p->old_link != p->phy->link) { @@ -350,6 +351,9 @@ static void dsa_slave_adjust_link(struct net_device *dev) p->old_pause = p->phy->pause; } + if (ds->drv->adjust_link && status_changed) + ds->drv->adjust_link(ds, p->port, p->phy); + if (status_changed) phy_print_status(p->phy); } -- cgit v1.2.3 From ce31b31c68e7e39f29b1257581fbd08ce3ca5589 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:54 -0700 Subject: net: dsa: allow updating fixed PHY link information Allow switch drivers to hook a PHY link update callback to perform port-specific link work. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ net/dsa/slave.c | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2d3835924dd2..2c9563f0b2f4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -17,6 +17,7 @@ #include #include #include +#include #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 @@ -187,6 +188,8 @@ struct dsa_switch_driver { */ void (*adjust_link)(struct dsa_switch *ds, int port, struct phy_device *phydev); + void (*fixed_link_update)(struct dsa_switch *ds, int port, + struct fixed_phy_status *st); /* * ethtool hardware statistics. diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 398d0663d3dd..18ff53836fe3 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -358,6 +358,18 @@ static void dsa_slave_adjust_link(struct net_device *dev) phy_print_status(p->phy); } +static int dsa_slave_fixed_link_update(struct net_device *dev, + struct fixed_phy_status *status) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->drv->fixed_link_update) + ds->drv->fixed_link_update(ds, p->port, status); + + return 0; +} + /* slave device setup *******************************************************/ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, struct net_device *slave_dev) @@ -365,6 +377,7 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, struct dsa_switch *ds = p->parent; struct dsa_chip_data *cd = ds->pd; struct device_node *phy_dn, *port_dn; + bool phy_is_fixed = false; int ret; port_dn = cd->port_dn[p->port]; @@ -380,6 +393,7 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, pr_err("failed to register fixed PHY\n"); return; } + phy_is_fixed = true; phy_dn = port_dn; } @@ -388,6 +402,9 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, dsa_slave_adjust_link, 0, p->phy_interface); + if (p->phy && phy_is_fixed) + fixed_phy_set_link_update(p->phy, dsa_slave_fixed_link_update); + /* We could not connect to a designated PHY, so use the switch internal * MDIO bus instead */ -- cgit v1.2.3 From 5037d532b83d7325a2743dffe82882a64697a8e8 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:55 -0700 Subject: net: dsa: add Broadcom tag RX/TX handler Add support for the 4-bytes Broadcom tag that built-in switches such as the Starfighter 2 might insert when receiving packets, or that we need to insert while targetting specific switch ports. We use a fake local EtherType value for this 4-bytes switch tag: ETH_P_BRCMTAG to make sure we can assign DSA-specific network operations within the DSA drivers. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 5 ++ net/dsa/Kconfig | 3 + net/dsa/Makefile | 1 + net/dsa/dsa_priv.h | 3 + net/dsa/slave.c | 5 ++ net/dsa/tag_brcm.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 190 insertions(+) create mode 100644 net/dsa/tag_brcm.c (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2c9563f0b2f4..97712927a9d2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -19,6 +19,11 @@ #include #include +/* Not an official ethertype value, used only internally for DSA + * demultiplexing + */ +#define ETH_P_BRCMTAG (ETH_P_XDSA + 1) + #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index f5eede1d6cb8..a585fd6352eb 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -12,6 +12,9 @@ config NET_DSA if NET_DSA # tagging formats +config NET_DSA_TAG_BRCM + bool + config NET_DSA_TAG_DSA bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 7b9fcbbeda5d..da06ed1df620 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_NET_DSA) += dsa_core.o dsa_core-y += dsa.o slave.o # tagging formats +dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d20364ac1574..98afed4d92ba 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -57,5 +57,8 @@ extern const struct dsa_device_ops edsa_netdev_ops; /* tag_trailer.c */ extern const struct dsa_device_ops trailer_netdev_ops; +/* tag_brcm.c */ +extern const struct dsa_device_ops brcm_netdev_ops; + #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 18ff53836fe3..7333a4aebb7d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -450,6 +450,11 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, case htons(ETH_P_TRAILER): ds->dst->ops = &trailer_netdev_ops; break; +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM + case htons(ETH_P_BRCMTAG): + ds->dst->ops = &brcm_netdev_ops; + break; #endif default: ds->dst->ops = ¬ag_netdev_ops; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c new file mode 100644 index 000000000000..e0b759ec209e --- /dev/null +++ b/net/dsa/tag_brcm.c @@ -0,0 +1,173 @@ +/* + * Broadcom tag support + * + * Copyright (C) 2014 Broadcom Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include "dsa_priv.h" + +/* This tag length is 4 bytes, older ones were 6 bytes, we do not + * handle them + */ +#define BRCM_TAG_LEN 4 + +/* Tag is constructed and desconstructed using byte by byte access + * because the tag is placed after the MAC Source Address, which does + * not make it 4-bytes aligned, so this might cause unaligned accesses + * on most systems where this is used. + */ + +/* Ingress and egress opcodes */ +#define BRCM_OPCODE_SHIFT 5 +#define BRCM_OPCODE_MASK 0x7 + +/* Ingress fields */ +/* 1st byte in the tag */ +#define BRCM_IG_TC_SHIFT 2 +#define BRCM_IG_TC_MASK 0x7 +/* 2nd byte in the tag */ +#define BRCM_IG_TE_MASK 0x3 +#define BRCM_IG_TS_SHIFT 7 +/* 3rd byte in the tag */ +#define BRCM_IG_DSTMAP2_MASK 1 +#define BRCM_IG_DSTMAP1_MASK 0xff + +/* Egress fields */ + +/* 2nd byte in the tag */ +#define BRCM_EG_CID_MASK 0xff + +/* 3rd byte in the tag */ +#define BRCM_EG_RC_MASK 0xff +#define BRCM_EG_RC_RSVD (3 << 6) +#define BRCM_EG_RC_EXCEPTION (1 << 5) +#define BRCM_EG_RC_PROT_SNOOP (1 << 4) +#define BRCM_EG_RC_PROT_TERM (1 << 3) +#define BRCM_EG_RC_SWITCH (1 << 2) +#define BRCM_EG_RC_MAC_LEARN (1 << 1) +#define BRCM_EG_RC_MIRROR (1 << 0) +#define BRCM_EG_TC_SHIFT 5 +#define BRCM_EG_TC_MASK 0x7 +#define BRCM_EG_PID_MASK 0x1f + +static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + u8 *brcm_tag; + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + if (skb_cow_head(skb, BRCM_TAG_LEN) < 0) + goto out_free; + + skb_push(skb, BRCM_TAG_LEN); + + memmove(skb->data, skb->data + BRCM_TAG_LEN, 2 * ETH_ALEN); + + /* Build the tag after the MAC Source Address */ + brcm_tag = skb->data + 2 * ETH_ALEN; + + /* Set the ingress opcode, traffic class, tag enforcment is + * deprecated + */ + brcm_tag[0] = (1 << BRCM_OPCODE_SHIFT) | + ((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK); + brcm_tag[1] = 0; + brcm_tag[2] = 0; + if (p->port == 8) + brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; + brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK; + + /* Queue the SKB for transmission on the parent interface, but + * do not modify its EtherType + */ + skb->protocol = htons(ETH_P_BRCMTAG); + skb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(skb); + + return NETDEV_TX_OK; + +out_free: + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + int source_port; + u8 *brcm_tag; + + if (unlikely(dst == NULL)) + goto out_drop; + + ds = dst->ds[0]; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (skb == NULL) + goto out; + + if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN))) + goto out_drop; + + /* skb->data points to the EtherType, the tag is right before it */ + brcm_tag = skb->data - 2; + + /* The opcode should never be different than 0b000 */ + if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK)) + goto out_drop; + + /* We should never see a reserved reason code without knowing how to + * handle it + */ + WARN_ON(brcm_tag[2] & BRCM_EG_RC_RSVD); + + /* Locate which port this is coming from */ + source_port = brcm_tag[3] & BRCM_EG_PID_MASK; + + /* Validate port against switch setup, either the port is totally */ + if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + goto out_drop; + + /* Remove Broadcom tag and update checksum */ + skb_pull_rcsum(skb, BRCM_TAG_LEN); + + /* Move the Ethernet DA and SA */ + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - BRCM_TAG_LEN, + 2 * ETH_ALEN); + + skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->dev = ds->ports[source_port]; + skb->protocol = eth_type_trans(skb, skb->dev); + + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; + + netif_receive_skb(skb); + + return 0; + +out_drop: + kfree_skb(skb); +out: + return 0; +} + +const struct dsa_device_ops brcm_netdev_ops = { + .xmit = brcm_tag_xmit, + .rcv = brcm_tag_rcv, +}; -- cgit v1.2.3 From 246d7f773c13cac3e3ab1609fd4ffee520242c63 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:56 -0700 Subject: net: dsa: add Broadcom SF2 switch driver Add support for the Broadcom Starfigther 2 switch chip using a DSA driver. This switch driver supports the following features: - configuration of the external switch port interface: MII, RevMII, RGMII and RGMII_NO_ID are supported - support for the per-port MIB counters - support for link interrupts for special ports (e.g: MoCA) - powering up/down of switch memories to conserve power when ports are unused Finally, update the compatible property for the DSA core code to match our switch top-level compatible node. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/Kconfig | 11 + drivers/net/dsa/Makefile | 1 + drivers/net/dsa/bcm_sf2.c | 626 +++++++++++++++++++++++++++++++++++++++++ drivers/net/dsa/bcm_sf2.h | 140 +++++++++ drivers/net/dsa/bcm_sf2_regs.h | 227 +++++++++++++++ net/dsa/dsa.c | 1 + 6 files changed, 1006 insertions(+) create mode 100644 drivers/net/dsa/bcm_sf2.c create mode 100644 drivers/net/dsa/bcm_sf2.h create mode 100644 drivers/net/dsa/bcm_sf2_regs.h (limited to 'net') diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig index b8fe808b7957..c6ee07c6a1b5 100644 --- a/drivers/net/dsa/Kconfig +++ b/drivers/net/dsa/Kconfig @@ -36,4 +36,15 @@ config NET_DSA_MV88E6123_61_65 This enables support for the Marvell 88E6123/6161/6165 ethernet switch chips. +config NET_DSA_BCM_SF2 + tristate "Broadcom Starfighter 2 Ethernet switch support" + select NET_DSA + select NET_DSA_TAG_BRCM + select FIXED_PHY if NET_DSA_BCM_SF2=y + select BCM7XXX_PHY + select MDIO_BCM_UNIMAC + ---help--- + This enables support for the Broadcom Starfighter 2 Ethernet + switch chips. + endmenu diff --git a/drivers/net/dsa/Makefile b/drivers/net/dsa/Makefile index f3bda05536cc..dd3cd3b8157f 100644 --- a/drivers/net/dsa/Makefile +++ b/drivers/net/dsa/Makefile @@ -7,3 +7,4 @@ endif ifdef CONFIG_NET_DSA_MV88E6131 mv88e6xxx_drv-y += mv88e6131.o endif +obj-$(CONFIG_NET_DSA_BCM_SF2) += bcm_sf2.o diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c new file mode 100644 index 000000000000..bb7cb8e283b1 --- /dev/null +++ b/drivers/net/dsa/bcm_sf2.c @@ -0,0 +1,626 @@ +/* + * Broadcom Starfighter 2 DSA switch driver + * + * Copyright (C) 2014, Broadcom Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bcm_sf2.h" +#include "bcm_sf2_regs.h" + +/* String, offset, and register size in bytes if different from 4 bytes */ +static const struct bcm_sf2_hw_stats bcm_sf2_mib[] = { + { "TxOctets", 0x000, 8 }, + { "TxDropPkts", 0x020 }, + { "TxQPKTQ0", 0x030 }, + { "TxBroadcastPkts", 0x040 }, + { "TxMulticastPkts", 0x050 }, + { "TxUnicastPKts", 0x060 }, + { "TxCollisions", 0x070 }, + { "TxSingleCollision", 0x080 }, + { "TxMultipleCollision", 0x090 }, + { "TxDeferredCollision", 0x0a0 }, + { "TxLateCollision", 0x0b0 }, + { "TxExcessiveCollision", 0x0c0 }, + { "TxFrameInDisc", 0x0d0 }, + { "TxPausePkts", 0x0e0 }, + { "TxQPKTQ1", 0x0f0 }, + { "TxQPKTQ2", 0x100 }, + { "TxQPKTQ3", 0x110 }, + { "TxQPKTQ4", 0x120 }, + { "TxQPKTQ5", 0x130 }, + { "RxOctets", 0x140, 8 }, + { "RxUndersizePkts", 0x160 }, + { "RxPausePkts", 0x170 }, + { "RxPkts64Octets", 0x180 }, + { "RxPkts65to127Octets", 0x190 }, + { "RxPkts128to255Octets", 0x1a0 }, + { "RxPkts256to511Octets", 0x1b0 }, + { "RxPkts512to1023Octets", 0x1c0 }, + { "RxPkts1024toMaxPktsOctets", 0x1d0 }, + { "RxOversizePkts", 0x1e0 }, + { "RxJabbers", 0x1f0 }, + { "RxAlignmentErrors", 0x200 }, + { "RxFCSErrors", 0x210 }, + { "RxGoodOctets", 0x220, 8 }, + { "RxDropPkts", 0x240 }, + { "RxUnicastPkts", 0x250 }, + { "RxMulticastPkts", 0x260 }, + { "RxBroadcastPkts", 0x270 }, + { "RxSAChanges", 0x280 }, + { "RxFragments", 0x290 }, + { "RxJumboPkt", 0x2a0 }, + { "RxSymblErr", 0x2b0 }, + { "InRangeErrCount", 0x2c0 }, + { "OutRangeErrCount", 0x2d0 }, + { "EEELpiEvent", 0x2e0 }, + { "EEELpiDuration", 0x2f0 }, + { "RxDiscard", 0x300, 8 }, + { "TxQPKTQ6", 0x320 }, + { "TxQPKTQ7", 0x330 }, + { "TxPkts64Octets", 0x340 }, + { "TxPkts65to127Octets", 0x350 }, + { "TxPkts128to255Octets", 0x360 }, + { "TxPkts256to511Ocets", 0x370 }, + { "TxPkts512to1023Ocets", 0x380 }, + { "TxPkts1024toMaxPktOcets", 0x390 }, +}; + +#define BCM_SF2_STATS_SIZE ARRAY_SIZE(bcm_sf2_mib) + +static void bcm_sf2_sw_get_strings(struct dsa_switch *ds, + int port, uint8_t *data) +{ + unsigned int i; + + for (i = 0; i < BCM_SF2_STATS_SIZE; i++) + memcpy(data + i * ETH_GSTRING_LEN, + bcm_sf2_mib[i].string, ETH_GSTRING_LEN); +} + +static void bcm_sf2_sw_get_ethtool_stats(struct dsa_switch *ds, + int port, uint64_t *data) +{ + struct bcm_sf2_priv *priv = ds_to_priv(ds); + const struct bcm_sf2_hw_stats *s; + unsigned int i; + u64 val = 0; + u32 offset; + + mutex_lock(&priv->stats_mutex); + + /* Now fetch the per-port counters */ + for (i = 0; i < BCM_SF2_STATS_SIZE; i++) { + s = &bcm_sf2_mib[i]; + + /* Do a latched 64-bit read if needed */ + offset = s->reg + CORE_P_MIB_OFFSET(port); + if (s->sizeof_stat == 8) + val = core_readq(priv, offset); + else + val = core_readl(priv, offset); + + data[i] = (u64)val; + } + + mutex_unlock(&priv->stats_mutex); +} + +static int bcm_sf2_sw_get_sset_count(struct dsa_switch *ds) +{ + return BCM_SF2_STATS_SIZE; +} + +static char *bcm_sf2_sw_probe(struct mii_bus *bus, int sw_addr) +{ + return "Broadcom Starfighter 2"; +} + +static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port) +{ + struct bcm_sf2_priv *priv = ds_to_priv(ds); + unsigned int i; + u32 reg, val; + + /* Enable the port memories */ + reg = core_readl(priv, CORE_MEM_PSM_VDD_CTRL); + reg &= ~P_TXQ_PSM_VDD(port); + core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL); + + /* Enable Broadcast, Multicast, Unicast forwarding to IMP port */ + reg = core_readl(priv, CORE_IMP_CTL); + reg |= (RX_BCST_EN | RX_MCST_EN | RX_UCST_EN); + reg &= ~(RX_DIS | TX_DIS); + core_writel(priv, reg, CORE_IMP_CTL); + + /* Enable forwarding */ + core_writel(priv, SW_FWDG_EN, CORE_SWMODE); + + /* Enable IMP port in dumb mode */ + reg = core_readl(priv, CORE_SWITCH_CTRL); + reg |= MII_DUMB_FWDG_EN; + core_writel(priv, reg, CORE_SWITCH_CTRL); + + /* Resolve which bit controls the Broadcom tag */ + switch (port) { + case 8: + val = BRCM_HDR_EN_P8; + break; + case 7: + val = BRCM_HDR_EN_P7; + break; + case 5: + val = BRCM_HDR_EN_P5; + break; + default: + val = 0; + break; + } + + /* Enable Broadcom tags for IMP port */ + reg = core_readl(priv, CORE_BRCM_HDR_CTRL); + reg |= val; + core_writel(priv, reg, CORE_BRCM_HDR_CTRL); + + /* Enable reception Broadcom tag for CPU TX (switch RX) to + * allow us to tag outgoing frames + */ + reg = core_readl(priv, CORE_BRCM_HDR_RX_DIS); + reg &= ~(1 << port); + core_writel(priv, reg, CORE_BRCM_HDR_RX_DIS); + + /* Enable transmission of Broadcom tags from the switch (CPU RX) to + * allow delivering frames to the per-port net_devices + */ + reg = core_readl(priv, CORE_BRCM_HDR_TX_DIS); + reg &= ~(1 << port); + core_writel(priv, reg, CORE_BRCM_HDR_TX_DIS); + + /* Force link status for IMP port */ + reg = core_readl(priv, CORE_STS_OVERRIDE_IMP); + reg |= (MII_SW_OR | LINK_STS); + core_writel(priv, reg, CORE_STS_OVERRIDE_IMP); + + /* Enable the IMP Port to be in the same VLAN as the other ports + * on a per-port basis such that we only have Port i and IMP in + * the same VLAN. + */ + for (i = 0; i < priv->hw_params.num_ports; i++) { + if (!((1 << i) & ds->phys_port_mask)) + continue; + + reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(i)); + reg |= (1 << port); + core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(i)); + } +} + +static void bcm_sf2_port_setup(struct dsa_switch *ds, int port) +{ + struct bcm_sf2_priv *priv = ds_to_priv(ds); + u32 reg; + + /* Clear the memory power down */ + reg = core_readl(priv, CORE_MEM_PSM_VDD_CTRL); + reg &= ~P_TXQ_PSM_VDD(port); + core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL); + + /* Clear the Rx and Tx disable bits and set to no spanning tree */ + core_writel(priv, 0, CORE_G_PCTL_PORT(port)); + + /* Enable port 7 interrupts to get notified */ + if (port == 7) + intrl2_1_mask_clear(priv, P_IRQ_MASK(P7_IRQ_OFF)); + + /* Set this port, and only this one to be in the default VLAN */ + reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(port)); + reg &= ~PORT_VLAN_CTRL_MASK; + reg |= (1 << port); + core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(port)); +} + +static void bcm_sf2_port_disable(struct dsa_switch *ds, int port) +{ + struct bcm_sf2_priv *priv = ds_to_priv(ds); + u32 off, reg; + + if (dsa_is_cpu_port(ds, port)) + off = CORE_IMP_CTL; + else + off = CORE_G_PCTL_PORT(port); + + reg = core_readl(priv, off); + reg |= RX_DIS | TX_DIS; + core_writel(priv, reg, off); + + /* Power down the port memory */ + reg = core_readl(priv, CORE_MEM_PSM_VDD_CTRL); + reg |= P_TXQ_PSM_VDD(port); + core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL); +} + +static irqreturn_t bcm_sf2_switch_0_isr(int irq, void *dev_id) +{ + struct bcm_sf2_priv *priv = dev_id; + + priv->irq0_stat = intrl2_0_readl(priv, INTRL2_CPU_STATUS) & + ~priv->irq0_mask; + intrl2_0_writel(priv, priv->irq0_stat, INTRL2_CPU_CLEAR); + + return IRQ_HANDLED; +} + +static irqreturn_t bcm_sf2_switch_1_isr(int irq, void *dev_id) +{ + struct bcm_sf2_priv *priv = dev_id; + + priv->irq1_stat = intrl2_1_readl(priv, INTRL2_CPU_STATUS) & + ~priv->irq1_mask; + intrl2_1_writel(priv, priv->irq1_stat, INTRL2_CPU_CLEAR); + + if (priv->irq1_stat & P_LINK_UP_IRQ(P7_IRQ_OFF)) + priv->port_sts[7].link = 1; + if (priv->irq1_stat & P_LINK_DOWN_IRQ(P7_IRQ_OFF)) + priv->port_sts[7].link = 0; + + return IRQ_HANDLED; +} + +static int bcm_sf2_sw_setup(struct dsa_switch *ds) +{ + const char *reg_names[BCM_SF2_REGS_NUM] = BCM_SF2_REGS_NAME; + struct bcm_sf2_priv *priv = ds_to_priv(ds); + struct device_node *dn; + void __iomem **base; + unsigned int port; + unsigned int i; + u32 reg, rev; + int ret; + + spin_lock_init(&priv->indir_lock); + mutex_init(&priv->stats_mutex); + + /* All the interesting properties are at the parent device_node + * level + */ + dn = ds->pd->of_node->parent; + + priv->irq0 = irq_of_parse_and_map(dn, 0); + priv->irq1 = irq_of_parse_and_map(dn, 1); + + base = &priv->core; + for (i = 0; i < BCM_SF2_REGS_NUM; i++) { + *base = of_iomap(dn, i); + if (*base == NULL) { + pr_err("unable to find register: %s\n", reg_names[i]); + return -ENODEV; + } + base++; + } + + /* Disable all interrupts and request them */ + intrl2_0_writel(priv, 0xffffffff, INTRL2_CPU_MASK_SET); + intrl2_0_writel(priv, 0xffffffff, INTRL2_CPU_CLEAR); + intrl2_0_writel(priv, 0, INTRL2_CPU_MASK_CLEAR); + intrl2_1_writel(priv, 0xffffffff, INTRL2_CPU_MASK_SET); + intrl2_1_writel(priv, 0xffffffff, INTRL2_CPU_CLEAR); + intrl2_1_writel(priv, 0, INTRL2_CPU_MASK_CLEAR); + + ret = request_irq(priv->irq0, bcm_sf2_switch_0_isr, 0, + "switch_0", priv); + if (ret < 0) { + pr_err("failed to request switch_0 IRQ\n"); + goto out_unmap; + } + + ret = request_irq(priv->irq1, bcm_sf2_switch_1_isr, 0, + "switch_1", priv); + if (ret < 0) { + pr_err("failed to request switch_1 IRQ\n"); + goto out_free_irq0; + } + + /* Reset the MIB counters */ + reg = core_readl(priv, CORE_GMNCFGCFG); + reg |= RST_MIB_CNT; + core_writel(priv, reg, CORE_GMNCFGCFG); + reg &= ~RST_MIB_CNT; + core_writel(priv, reg, CORE_GMNCFGCFG); + + /* Get the maximum number of ports for this switch */ + priv->hw_params.num_ports = core_readl(priv, CORE_IMP0_PRT_ID) + 1; + if (priv->hw_params.num_ports > DSA_MAX_PORTS) + priv->hw_params.num_ports = DSA_MAX_PORTS; + + /* Assume a single GPHY setup if we can't read that property */ + if (of_property_read_u32(dn, "brcm,num-gphy", + &priv->hw_params.num_gphy)) + priv->hw_params.num_gphy = 1; + + /* Enable all valid ports and disable those unused */ + for (port = 0; port < priv->hw_params.num_ports; port++) { + /* IMP port receives special treatment */ + if ((1 << port) & ds->phys_port_mask) + bcm_sf2_port_setup(ds, port); + else if (dsa_is_cpu_port(ds, port)) + bcm_sf2_imp_setup(ds, port); + else + bcm_sf2_port_disable(ds, port); + } + + /* Include the pseudo-PHY address and the broadcast PHY address to + * divert reads towards our workaround + */ + ds->phys_mii_mask |= ((1 << 30) | (1 << 0)); + + rev = reg_readl(priv, REG_SWITCH_REVISION); + priv->hw_params.top_rev = (rev >> SWITCH_TOP_REV_SHIFT) & + SWITCH_TOP_REV_MASK; + priv->hw_params.core_rev = (rev & SF2_REV_MASK); + + pr_info("Starfighter 2 top: %x.%02x, core: %x.%02x base: 0x%p, IRQs: %d, %d\n", + priv->hw_params.top_rev >> 8, priv->hw_params.top_rev & 0xff, + priv->hw_params.core_rev >> 8, priv->hw_params.core_rev & 0xff, + priv->core, priv->irq0, priv->irq1); + + return 0; + +out_free_irq0: + free_irq(priv->irq0, priv); +out_unmap: + base = &priv->core; + for (i = 0; i < BCM_SF2_REGS_NUM; i++) { + iounmap(*base); + base++; + } + return ret; +} + +static int bcm_sf2_sw_set_addr(struct dsa_switch *ds, u8 *addr) +{ + return 0; +} + +static int bcm_sf2_sw_indir_rw(struct dsa_switch *ds, int op, int addr, + int regnum, u16 val) +{ + struct bcm_sf2_priv *priv = ds_to_priv(ds); + int ret = 0; + u32 reg; + + reg = reg_readl(priv, REG_SWITCH_CNTRL); + reg |= MDIO_MASTER_SEL; + reg_writel(priv, reg, REG_SWITCH_CNTRL); + + /* Page << 8 | offset */ + reg = 0x70; + reg <<= 2; + core_writel(priv, addr, reg); + + /* Page << 8 | offset */ + reg = 0x80 << 8 | regnum << 1; + reg <<= 2; + + if (op) + ret = core_readl(priv, reg); + else + core_writel(priv, val, reg); + + reg = reg_readl(priv, REG_SWITCH_CNTRL); + reg &= ~MDIO_MASTER_SEL; + reg_writel(priv, reg, REG_SWITCH_CNTRL); + + return ret & 0xffff; +} + +static int bcm_sf2_sw_phy_read(struct dsa_switch *ds, int addr, int regnum) +{ + /* Intercept reads from the MDIO broadcast address or Broadcom + * pseudo-PHY address + */ + switch (addr) { + case 0: + case 30: + return bcm_sf2_sw_indir_rw(ds, 1, addr, regnum, 0); + default: + return 0xffff; + } +} + +static int bcm_sf2_sw_phy_write(struct dsa_switch *ds, int addr, int regnum, + u16 val) +{ + /* Intercept writes to the MDIO broadcast address or Broadcom + * pseudo-PHY address + */ + switch (addr) { + case 0: + case 30: + bcm_sf2_sw_indir_rw(ds, 0, addr, regnum, val); + break; + } + + return 0; +} + +static void bcm_sf2_sw_adjust_link(struct dsa_switch *ds, int port, + struct phy_device *phydev) +{ + struct bcm_sf2_priv *priv = ds_to_priv(ds); + u32 id_mode_dis = 0, port_mode; + const char *str = NULL; + u32 reg; + + switch (phydev->interface) { + case PHY_INTERFACE_MODE_RGMII: + str = "RGMII (no delay)"; + id_mode_dis = 1; + case PHY_INTERFACE_MODE_RGMII_TXID: + if (!str) + str = "RGMII (TX delay)"; + port_mode = EXT_GPHY; + break; + case PHY_INTERFACE_MODE_MII: + str = "MII"; + port_mode = EXT_EPHY; + break; + case PHY_INTERFACE_MODE_REVMII: + str = "Reverse MII"; + port_mode = EXT_REVMII; + break; + default: + goto force_link; + } + + /* Clear id_mode_dis bit, and the existing port mode, but + * make sure we enable the RGMII block for data to pass + */ + reg = reg_readl(priv, REG_RGMII_CNTRL_P(port)); + reg &= ~ID_MODE_DIS; + reg &= ~(PORT_MODE_MASK << PORT_MODE_SHIFT); + reg &= ~(RX_PAUSE_EN | TX_PAUSE_EN); + + reg |= port_mode | RGMII_MODE_EN; + if (id_mode_dis) + reg |= ID_MODE_DIS; + + if (phydev->pause) { + if (phydev->asym_pause) + reg |= TX_PAUSE_EN; + reg |= RX_PAUSE_EN; + } + + reg_writel(priv, reg, REG_RGMII_CNTRL_P(port)); + + pr_info("Port %d configured for %s\n", port, str); + +force_link: + /* Force link settings detected from the PHY */ + reg = SW_OVERRIDE; + switch (phydev->speed) { + case SPEED_1000: + reg |= SPDSTS_1000 << SPEED_SHIFT; + break; + case SPEED_100: + reg |= SPDSTS_100 << SPEED_SHIFT; + break; + } + + if (phydev->link) + reg |= LINK_STS; + if (phydev->duplex == DUPLEX_FULL) + reg |= DUPLX_MODE; + + core_writel(priv, reg, CORE_STS_OVERRIDE_GMIIP_PORT(port)); +} + +static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port, + struct fixed_phy_status *status) +{ + struct bcm_sf2_priv *priv = ds_to_priv(ds); + u32 link, duplex, pause, speed; + u32 reg; + + link = core_readl(priv, CORE_LNKSTS); + duplex = core_readl(priv, CORE_DUPSTS); + pause = core_readl(priv, CORE_PAUSESTS); + speed = core_readl(priv, CORE_SPDSTS); + + speed >>= (port * SPDSTS_SHIFT); + speed &= SPDSTS_MASK; + + status->link = 0; + + /* Port 7 is special as we do not get link status from CORE_LNKSTS, + * which means that we need to force the link at the port override + * level to get the data to flow. We do use what the interrupt handler + * did determine before. + */ + if (port == 7) { + status->link = priv->port_sts[port].link; + reg = core_readl(priv, CORE_STS_OVERRIDE_GMIIP_PORT(7)); + reg |= SW_OVERRIDE; + if (status->link) + reg |= LINK_STS; + else + reg &= ~LINK_STS; + core_writel(priv, reg, CORE_STS_OVERRIDE_GMIIP_PORT(7)); + status->duplex = 1; + } else { + status->link = !!(link & (1 << port)); + status->duplex = !!(duplex & (1 << port)); + } + + switch (speed) { + case SPDSTS_10: + status->speed = SPEED_10; + break; + case SPDSTS_100: + status->speed = SPEED_100; + break; + case SPDSTS_1000: + status->speed = SPEED_1000; + break; + } + + if ((pause & (1 << port)) && + (pause & (1 << (port + PAUSESTS_TX_PAUSE_SHIFT)))) { + status->asym_pause = 1; + status->pause = 1; + } + + if (pause & (1 << port)) + status->pause = 1; +} + +static struct dsa_switch_driver bcm_sf2_switch_driver = { + .tag_protocol = htons(ETH_P_BRCMTAG), + .priv_size = sizeof(struct bcm_sf2_priv), + .probe = bcm_sf2_sw_probe, + .setup = bcm_sf2_sw_setup, + .set_addr = bcm_sf2_sw_set_addr, + .phy_read = bcm_sf2_sw_phy_read, + .phy_write = bcm_sf2_sw_phy_write, + .get_strings = bcm_sf2_sw_get_strings, + .get_ethtool_stats = bcm_sf2_sw_get_ethtool_stats, + .get_sset_count = bcm_sf2_sw_get_sset_count, + .adjust_link = bcm_sf2_sw_adjust_link, + .fixed_link_update = bcm_sf2_sw_fixed_link_update, +}; + +static int __init bcm_sf2_init(void) +{ + register_switch_driver(&bcm_sf2_switch_driver); + + return 0; +} +module_init(bcm_sf2_init); + +static void __exit bcm_sf2_exit(void) +{ + unregister_switch_driver(&bcm_sf2_switch_driver); +} +module_exit(bcm_sf2_exit); + +MODULE_AUTHOR("Broadcom Corporation"); +MODULE_DESCRIPTION("Driver for Broadcom Starfighter 2 ethernet switch chip"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:brcm-sf2"); diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h new file mode 100644 index 000000000000..260bab313e58 --- /dev/null +++ b/drivers/net/dsa/bcm_sf2.h @@ -0,0 +1,140 @@ +/* + * Broadcom Starfighter2 private context + * + * Copyright (C) 2014, Broadcom Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __BCM_SF2_H +#define __BCM_SF2_H + +#include +#include +#include +#include +#include +#include + +#include + +#include "bcm_sf2_regs.h" + +struct bcm_sf2_hw_params { + u16 top_rev; + u16 core_rev; + u32 num_gphy; + u8 num_acb_queue; + u8 num_rgmii; + u8 num_ports; + u8 fcb_pause_override:1; + u8 acb_packets_inflight:1; +}; + +#define BCM_SF2_REGS_NAME {\ + "core", "reg", "intrl2_0", "intrl2_1", "fcb", "acb" \ +} + +#define BCM_SF2_REGS_NUM 6 + +struct bcm_sf2_port_status { + unsigned int link; +}; + +struct bcm_sf2_priv { + /* Base registers, keep those in order with BCM_SF2_REGS_NAME */ + void __iomem *core; + void __iomem *reg; + void __iomem *intrl2_0; + void __iomem *intrl2_1; + void __iomem *fcb; + void __iomem *acb; + + /* spinlock protecting access to the indirect registers */ + spinlock_t indir_lock; + + int irq0; + int irq1; + u32 irq0_stat; + u32 irq0_mask; + u32 irq1_stat; + u32 irq1_mask; + + /* Mutex protecting access to the MIB counters */ + struct mutex stats_mutex; + + struct bcm_sf2_hw_params hw_params; + + struct bcm_sf2_port_status port_sts[DSA_MAX_PORTS]; +}; + +struct bcm_sf2_hw_stats { + const char *string; + u16 reg; + u8 sizeof_stat; +}; + +#define SF2_IO_MACRO(name) \ +static inline u32 name##_readl(struct bcm_sf2_priv *priv, u32 off) \ +{ \ + return __raw_readl(priv->name + off); \ +} \ +static inline void name##_writel(struct bcm_sf2_priv *priv, \ + u32 val, u32 off) \ +{ \ + __raw_writel(val, priv->name + off); \ +} \ + +/* Accesses to 64-bits register requires us to latch the hi/lo pairs + * using the REG_DIR_DATA_{READ,WRITE} ancillary registers. The 'indir_lock' + * spinlock is automatically grabbed and released to provide relative + * atomiticy with latched reads/writes. + */ +#define SF2_IO64_MACRO(name) \ +static inline u64 name##_readq(struct bcm_sf2_priv *priv, u32 off) \ +{ \ + u32 indir, dir; \ + spin_lock(&priv->indir_lock); \ + indir = reg_readl(priv, REG_DIR_DATA_READ); \ + dir = __raw_readl(priv->name + off); \ + spin_unlock(&priv->indir_lock); \ + return (u64)indir << 32 | dir; \ +} \ +static inline void name##_writeq(struct bcm_sf2_priv *priv, u32 off, \ + u64 val) \ +{ \ + spin_lock(&priv->indir_lock); \ + reg_writel(priv, upper_32_bits(val), REG_DIR_DATA_WRITE); \ + __raw_writel(lower_32_bits(val), priv->name + off); \ + spin_unlock(&priv->indir_lock); \ +} + +#define SWITCH_INTR_L2(which) \ +static inline void intrl2_##which##_mask_clear(struct bcm_sf2_priv *priv, \ + u32 mask) \ +{ \ + intrl2_##which##_writel(priv, mask, INTRL2_CPU_MASK_CLEAR); \ + priv->irq##which##_mask &= ~(mask); \ +} \ +static inline void intrl2_##which##_mask_set(struct bcm_sf2_priv *priv, \ + u32 mask) \ +{ \ + intrl2_## which##_writel(priv, mask, INTRL2_CPU_MASK_SET); \ + priv->irq##which##_mask |= (mask); \ +} \ + +SF2_IO_MACRO(core); +SF2_IO_MACRO(reg); +SF2_IO64_MACRO(core); +SF2_IO_MACRO(intrl2_0); +SF2_IO_MACRO(intrl2_1); +SF2_IO_MACRO(fcb); +SF2_IO_MACRO(acb); + +SWITCH_INTR_L2(0); +SWITCH_INTR_L2(1); + +#endif /* __BCM_SF2_H */ diff --git a/drivers/net/dsa/bcm_sf2_regs.h b/drivers/net/dsa/bcm_sf2_regs.h new file mode 100644 index 000000000000..885c231b03b5 --- /dev/null +++ b/drivers/net/dsa/bcm_sf2_regs.h @@ -0,0 +1,227 @@ +/* + * Broadcom Starfighter 2 switch register defines + * + * Copyright (C) 2014, Broadcom Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#ifndef __BCM_SF2_REGS_H +#define __BCM_SF2_REGS_H + +/* Register set relative to 'REG' */ +#define REG_SWITCH_CNTRL 0x00 +#define MDIO_MASTER_SEL (1 << 0) + +#define REG_SWITCH_STATUS 0x04 +#define REG_DIR_DATA_WRITE 0x08 +#define REG_DIR_DATA_READ 0x0C + +#define REG_SWITCH_REVISION 0x18 +#define SF2_REV_MASK 0xffff +#define SWITCH_TOP_REV_SHIFT 16 +#define SWITCH_TOP_REV_MASK 0xffff + +#define REG_PHY_REVISION 0x1C + +#define REG_SPHY_CNTRL 0x2C +#define IDDQ_BIAS (1 << 0) +#define EXT_PWR_DOWN (1 << 1) +#define FORCE_DLL_EN (1 << 2) +#define IDDQ_GLOBAL_PWR (1 << 3) +#define CK25_DIS (1 << 4) +#define PHY_RESET (1 << 5) +#define PHY_PHYAD_SHIFT 8 +#define PHY_PHYAD_MASK 0x1F + +#define REG_RGMII_0_BASE 0x34 +#define REG_RGMII_CNTRL 0x00 +#define REG_RGMII_IB_STATUS 0x04 +#define REG_RGMII_RX_CLOCK_DELAY_CNTRL 0x08 +#define REG_RGMII_CNTRL_SIZE 0x0C +#define REG_RGMII_CNTRL_P(x) (REG_RGMII_0_BASE + \ + ((x) * REG_RGMII_CNTRL_SIZE)) +/* Relative to REG_RGMII_CNTRL */ +#define RGMII_MODE_EN (1 << 0) +#define ID_MODE_DIS (1 << 1) +#define PORT_MODE_SHIFT 2 +#define INT_EPHY (0 << PORT_MODE_SHIFT) +#define INT_GPHY (1 << PORT_MODE_SHIFT) +#define EXT_EPHY (2 << PORT_MODE_SHIFT) +#define EXT_GPHY (3 << PORT_MODE_SHIFT) +#define EXT_REVMII (4 << PORT_MODE_SHIFT) +#define PORT_MODE_MASK 0x7 +#define RVMII_REF_SEL (1 << 5) +#define RX_PAUSE_EN (1 << 6) +#define TX_PAUSE_EN (1 << 7) +#define TX_CLK_STOP_EN (1 << 8) +#define LPI_COUNT_SHIFT 9 +#define LPI_COUNT_MASK 0x3F + +/* Register set relative to 'INTRL2_0' and 'INTRL2_1' */ +#define INTRL2_CPU_STATUS 0x00 +#define INTRL2_CPU_SET 0x04 +#define INTRL2_CPU_CLEAR 0x08 +#define INTRL2_CPU_MASK_STATUS 0x0c +#define INTRL2_CPU_MASK_SET 0x10 +#define INTRL2_CPU_MASK_CLEAR 0x14 + +/* Shared INTRL2_0 and INTRL2_ interrupt sources macros */ +#define P_LINK_UP_IRQ(x) (1 << (0 + (x))) +#define P_LINK_DOWN_IRQ(x) (1 << (1 + (x))) +#define P_ENERGY_ON_IRQ(x) (1 << (2 + (x))) +#define P_ENERGY_OFF_IRQ(x) (1 << (3 + (x))) +#define P_GPHY_IRQ(x) (1 << (4 + (x))) +#define P_NUM_IRQ 5 +#define P_IRQ_MASK(x) (P_LINK_UP_IRQ((x)) | \ + P_LINK_DOWN_IRQ((x)) | \ + P_ENERGY_ON_IRQ((x)) | \ + P_ENERGY_OFF_IRQ((x)) | \ + P_GPHY_IRQ((x))) + +/* INTRL2_0 interrupt sources */ +#define P0_IRQ_OFF 0 +#define MEM_DOUBLE_IRQ (1 << 5) +#define EEE_LPI_IRQ (1 << 6) +#define P5_CPU_WAKE_IRQ (1 << 7) +#define P8_CPU_WAKE_IRQ (1 << 8) +#define P7_CPU_WAKE_IRQ (1 << 9) +#define IEEE1588_IRQ (1 << 10) +#define MDIO_ERR_IRQ (1 << 11) +#define MDIO_DONE_IRQ (1 << 12) +#define GISB_ERR_IRQ (1 << 13) +#define UBUS_ERR_IRQ (1 << 14) +#define FAILOVER_ON_IRQ (1 << 15) +#define FAILOVER_OFF_IRQ (1 << 16) +#define TCAM_SOFT_ERR_IRQ (1 << 17) + +/* INTRL2_1 interrupt sources */ +#define P7_IRQ_OFF 0 +#define P_IRQ_OFF(x) ((6 - (x)) * P_NUM_IRQ) + +/* Register set relative to 'CORE' */ +#define CORE_G_PCTL_PORT0 0x00000 +#define CORE_G_PCTL_PORT(x) (CORE_G_PCTL_PORT0 + (x * 0x4)) +#define CORE_IMP_CTL 0x00020 +#define RX_DIS (1 << 0) +#define TX_DIS (1 << 1) +#define RX_BCST_EN (1 << 2) +#define RX_MCST_EN (1 << 3) +#define RX_UCST_EN (1 << 4) +#define G_MISTP_STATE_SHIFT 5 +#define G_MISTP_NO_STP (0 << G_MISTP_STATE_SHIFT) +#define G_MISTP_DIS_STATE (1 << G_MISTP_STATE_SHIFT) +#define G_MISTP_BLOCK_STATE (2 << G_MISTP_STATE_SHIFT) +#define G_MISTP_LISTEN_STATE (3 << G_MISTP_STATE_SHIFT) +#define G_MISTP_LEARN_STATE (4 << G_MISTP_STATE_SHIFT) +#define G_MISTP_FWD_STATE (5 << G_MISTP_STATE_SHIFT) +#define G_MISTP_STATE_MASK 0x7 + +#define CORE_SWMODE 0x0002c +#define SW_FWDG_MODE (1 << 0) +#define SW_FWDG_EN (1 << 1) +#define RTRY_LMT_DIS (1 << 2) + +#define CORE_STS_OVERRIDE_IMP 0x00038 +#define GMII_SPEED_UP_2G (1 << 6) +#define MII_SW_OR (1 << 7) + +#define CORE_NEW_CTRL 0x00084 +#define IP_MC (1 << 0) +#define OUTRANGEERR_DISCARD (1 << 1) +#define INRANGEERR_DISCARD (1 << 2) +#define CABLE_DIAG_LEN (1 << 3) +#define OVERRIDE_AUTO_PD_WAR (1 << 4) +#define EN_AUTO_PD_WAR (1 << 5) +#define UC_FWD_EN (1 << 6) +#define MC_FWD_EN (1 << 7) + +#define CORE_SWITCH_CTRL 0x00088 +#define MII_DUMB_FWDG_EN (1 << 6) + +#define CORE_SFT_LRN_CTRL 0x000f8 +#define SW_LEARN_CNTL(x) (1 << (x)) + +#define CORE_STS_OVERRIDE_GMIIP_PORT(x) (0x160 + (x) * 4) +#define LINK_STS (1 << 0) +#define DUPLX_MODE (1 << 1) +#define SPEED_SHIFT 2 +#define SPEED_MASK 0x3 +#define RXFLOW_CNTL (1 << 4) +#define TXFLOW_CNTL (1 << 5) +#define SW_OVERRIDE (1 << 6) + +#define CORE_WATCHDOG_CTRL 0x001e4 +#define SOFTWARE_RESET (1 << 7) +#define EN_CHIP_RST (1 << 6) +#define EN_SW_RESET (1 << 4) + +#define CORE_LNKSTS 0x00400 +#define LNK_STS_MASK 0x1ff + +#define CORE_SPDSTS 0x00410 +#define SPDSTS_10 0 +#define SPDSTS_100 1 +#define SPDSTS_1000 2 +#define SPDSTS_SHIFT 2 +#define SPDSTS_MASK 0x3 + +#define CORE_DUPSTS 0x00420 +#define CORE_DUPSTS_MASK 0x1ff + +#define CORE_PAUSESTS 0x00428 +#define PAUSESTS_TX_PAUSE_SHIFT 9 + +#define CORE_GMNCFGCFG 0x0800 +#define RST_MIB_CNT (1 << 0) +#define RXBPDU_EN (1 << 1) + +#define CORE_IMP0_PRT_ID 0x0804 + +#define CORE_BRCM_HDR_CTRL 0x0080c +#define BRCM_HDR_EN_P8 (1 << 0) +#define BRCM_HDR_EN_P5 (1 << 1) +#define BRCM_HDR_EN_P7 (1 << 2) + +#define CORE_BRCM_HDR_CTRL2 0x0828 + +#define CORE_HL_PRTC_CTRL 0x0940 +#define ARP_EN (1 << 0) +#define RARP_EN (1 << 1) +#define DHCP_EN (1 << 2) +#define ICMPV4_EN (1 << 3) +#define ICMPV6_EN (1 << 4) +#define ICMPV6_FWD_MODE (1 << 5) +#define IGMP_DIP_EN (1 << 8) +#define IGMP_RPTLVE_EN (1 << 9) +#define IGMP_RTPLVE_FWD_MODE (1 << 10) +#define IGMP_QRY_EN (1 << 11) +#define IGMP_QRY_FWD_MODE (1 << 12) +#define IGMP_UKN_EN (1 << 13) +#define IGMP_UKN_FWD_MODE (1 << 14) +#define MLD_RPTDONE_EN (1 << 15) +#define MLD_RPTDONE_FWD_MODE (1 << 16) +#define MLD_QRY_EN (1 << 17) +#define MLD_QRY_FWD_MODE (1 << 18) + +#define CORE_RST_MIB_CNT_EN 0x0950 + +#define CORE_BRCM_HDR_RX_DIS 0x0980 +#define CORE_BRCM_HDR_TX_DIS 0x0988 + +#define CORE_MEM_PSM_VDD_CTRL 0x2380 +#define P_TXQ_PSM_VDD_SHIFT 2 +#define P_TXQ_PSM_VDD_MASK 0x3 +#define P_TXQ_PSM_VDD(x) (P_TXQ_PSM_VDD_MASK << \ + ((x) * P_TXQ_PSM_VDD_SHIFT)) + +#define CORE_P0_MIB_OFFSET 0x8000 +#define P_MIB_SIZE 0x400 +#define CORE_P_MIB_OFFSET(x) (CORE_P0_MIB_OFFSET + (x) * P_MIB_SIZE) + +#define CORE_PORT_VLAN_CTL_PORT(x) (0xc400 + ((x) * 0x8)) +#define PORT_VLAN_CTRL_MASK 0x1ff + +#endif /* __BCM_SF2_REGS_H */ diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 4dc2a16b72cf..484f695351a7 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -635,6 +635,7 @@ struct packet_type dsa_pack_type __read_mostly = { }; static const struct of_device_id dsa_of_match_table[] = { + { .compatible = "brcm,bcm7445-switch-v4.0" }, { .compatible = "marvell,dsa", }, {} }; -- cgit v1.2.3 From 0244790c8ad2408dfb313e5c886e6e5a808ea946 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Fri, 29 Aug 2014 17:09:07 +0800 Subject: xfrm: remove useless hash_resize_mutex locks In xfrm_state.c, hash_resize_mutex is defined as a local variable and only used in xfrm_hash_resize() which is declared as a work handler of xfrm.state_hash_work. But when the xfrm.state_hash_work work is put in the global workqueue(system_wq) with schedule_work(), the work will be really inserted in the global workqueue if it was not already queued, otherwise, it is still left in the same position on the the global workqueue. This means the xfrm_hash_resize() work handler is only executed once at any time no matter how many times its work is scheduled, that is, xfrm_hash_resize() is not called concurrently at all, so hash_resize_mutex is redundant for us. Cc: Christophe Gouault Cc: Steffen Klassert Signed-off-by: Ying Xue Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 0ab54134bb40..de971b6d38c5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -97,8 +97,6 @@ static unsigned long xfrm_hash_new_size(unsigned int state_hmask) return ((state_hmask + 1) << 1) * sizeof(struct hlist_head); } -static DEFINE_MUTEX(hash_resize_mutex); - static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.state_hash_work); @@ -107,22 +105,20 @@ static void xfrm_hash_resize(struct work_struct *work) unsigned int nhashmask, ohashmask; int i; - mutex_lock(&hash_resize_mutex); - nsize = xfrm_hash_new_size(net->xfrm.state_hmask); ndst = xfrm_hash_alloc(nsize); if (!ndst) - goto out_unlock; + return; nsrc = xfrm_hash_alloc(nsize); if (!nsrc) { xfrm_hash_free(ndst, nsize); - goto out_unlock; + return; } nspi = xfrm_hash_alloc(nsize); if (!nspi) { xfrm_hash_free(ndst, nsize); xfrm_hash_free(nsrc, nsize); - goto out_unlock; + return; } spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -148,9 +144,6 @@ static void xfrm_hash_resize(struct work_struct *work) xfrm_hash_free(odst, osize); xfrm_hash_free(osrc, osize); xfrm_hash_free(ospi, osize); - -out_unlock: - mutex_unlock(&hash_resize_mutex); } static DEFINE_SPINLOCK(xfrm_state_afinfo_lock); -- cgit v1.2.3 From a00f4f6e048dff90e64c6d1bde2bb4587c6d2234 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Mon, 28 Jul 2014 15:16:59 +0200 Subject: mac80211: fix chantype recalc warning When a device driver is unloaded local->interfaces list is cleared. If there was more than 1 interface running and connected (bound to a chanctx) then chantype recalc was called and it ended up with compat being NULL causing a call trace warning. Warn if compat becomes NULL as a result of incompatible bss_conf.chandef of interfaces bound to a given channel context only. The call trace looked like this: WARNING: CPU: 2 PID: 2594 at /devel/src/linux/net/mac80211/chan.c:557 ieee80211_recalc_chanctx_chantype+0x2cd/0x2e0() Modules linked in: ath10k_pci(-) ath10k_core ath CPU: 2 PID: 2594 Comm: rmmod Tainted: G W 3.16.0-rc1+ #150 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 0000000000000009 ffff88001ea279c0 ffffffff818dfa93 0000000000000000 ffff88001ea279f8 ffffffff810514a8 ffff88001ce09cd0 ffff88001e03cc58 0000000000000000 ffff88001ce08840 ffff88001ce09cd0 ffff88001ea27a08 Call Trace: [] dump_stack+0x4d/0x66 [] warn_slowpath_common+0x78/0xa0 [] warn_slowpath_null+0x15/0x20 [] ieee80211_recalc_chanctx_chantype+0x2cd/0x2e0 [] ? ieee80211_recalc_chanctx_chantype+0x2a/0x2e0 [] ieee80211_assign_vif_chanctx+0x1a9/0x770 [] __ieee80211_vif_release_channel+0x70/0x130 [] ieee80211_vif_release_channel+0x43/0xb0 [] ieee80211_stop_ap+0x21e/0x5a0 [] __cfg80211_stop_ap+0x85/0x520 [] __cfg80211_leave+0x68/0x120 [] cfg80211_leave+0x28/0x40 [] cfg80211_netdev_notifier_call+0x373/0x6b0 [] notifier_call_chain+0x55/0x110 [] raw_notifier_call_chain+0x11/0x20 [] call_netdevice_notifiers_info+0x30/0x60 [] __dev_close_many+0x59/0xf0 [] dev_close_many+0x81/0x120 [] rollback_registered_many+0x115/0x2a0 [] unregister_netdevice_many+0x16/0xa0 [] ieee80211_remove_interfaces+0x121/0x1b0 [] ieee80211_unregister_hw+0x56/0x110 [] ath10k_mac_unregister+0x14/0x60 [ath10k_core] [] ath10k_core_unregister+0x27/0x40 [ath10k_core] [] ath10k_pci_remove+0x44/0xa0 [ath10k_pci] [] pci_device_remove+0x28/0x60 [] __device_release_driver+0x64/0xd0 [] driver_detach+0xb8/0xc0 [] bus_remove_driver+0x4a/0xb0 [] driver_unregister+0x27/0x50 Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 792eac6cc7b3..bd01a9f041bd 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -547,12 +547,12 @@ static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, compat = cfg80211_chandef_compatible( &sdata->vif.bss_conf.chandef, compat); - if (!compat) + if (WARN_ON_ONCE(!compat)) break; } rcu_read_unlock(); - if (WARN_ON_ONCE(!compat)) + if (!compat) return; ieee80211_change_chanctx(local, ctx, compat); -- cgit v1.2.3 From d0616613d9cf17919fbd46fa0274db4b0084ad62 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 19 Aug 2014 15:41:32 +0300 Subject: net: rfkill: gpio: Add more Broadcom bluetooth ACPI IDs This adds one more ACPI ID of a Broadcom bluetooth chip. Signed-off-by: Mika Westerberg Signed-off-by: Johannes Berg --- net/rfkill/rfkill-gpio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 14c98e48f261..02a86a27fd84 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -158,6 +158,7 @@ static const struct acpi_device_id rfkill_acpi_match[] = { { "BCM2E1A", RFKILL_TYPE_BLUETOOTH }, { "BCM2E39", RFKILL_TYPE_BLUETOOTH }, { "BCM2E3D", RFKILL_TYPE_BLUETOOTH }, + { "BCM2E64", RFKILL_TYPE_BLUETOOTH }, { "BCM4752", RFKILL_TYPE_GPS }, { "LNV4752", RFKILL_TYPE_GPS }, { }, -- cgit v1.2.3 From 10c51b56232d24f150e39884a9e749fd99cbc60c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 27 Aug 2014 11:11:27 +0200 Subject: net: add skb_get_tx_queue() helper Replace occurences of skb_get_queue_mapping() and follow-up netdev_get_tx_queue() with an actual helper function. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ net/core/netpoll.c | 2 +- net/core/pktgen.c | 4 +--- net/packet/af_packet.c | 4 +--- net/sched/sch_generic.c | 6 ++++-- 5 files changed, 13 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 429801370d0c..dfc1d8b8bd0f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1747,6 +1747,12 @@ struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, return &dev->_tx[index]; } +static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev, + const struct sk_buff *skb) +{ + return netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); +} + static inline void netdev_for_each_tx_queue(struct net_device *dev, void (*f)(struct net_device *, struct netdev_queue *, diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a5ad06828d67..12b1df976562 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -115,7 +115,7 @@ static void queue_process(struct work_struct *work) continue; } - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(dev, skb); local_irq_save(flags); HARD_TX_LOCK(dev, txq, smp_processor_id()); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 83e2b4b19eb7..d81b540096c3 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3286,7 +3286,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) { struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; - u16 queue_map; int ret; /* If device is offline, then don't send */ @@ -3324,8 +3323,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) if (pkt_dev->delay && pkt_dev->last_ok) spin(pkt_dev, pkt_dev->next_tx); - queue_map = skb_get_queue_mapping(pkt_dev->skb); - txq = netdev_get_tx_queue(odev, queue_map); + txq = skb_get_tx_queue(odev, pkt_dev->skb); local_bh_disable(); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0dfa990d4eaa..b7a7f5a721bd 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -243,7 +243,6 @@ static int packet_direct_xmit(struct sk_buff *skb) netdev_features_t features; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; - u16 queue_map; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) @@ -254,8 +253,7 @@ static int packet_direct_xmit(struct sk_buff *skb) __skb_linearize(skb)) goto drop; - queue_map = skb_get_queue_mapping(skb); - txq = netdev_get_tx_queue(dev, queue_map); + txq = skb_get_tx_queue(dev, skb); local_bh_disable(); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index fc04fe93c2da..05b3f5d104af 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -63,7 +63,7 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) if (unlikely(skb)) { /* check the reason of requeuing without tx lock first */ - txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { q->gso_skb = NULL; q->q.qlen--; @@ -183,10 +183,12 @@ static inline int qdisc_restart(struct Qdisc *q) skb = dequeue_skb(q); if (unlikely(!skb)) return 0; + WARN_ON_ONCE(skb_dst_is_noref(skb)); + root_lock = qdisc_lock(q); dev = qdisc_dev(q); - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(dev, skb); return sch_direct_xmit(skb, q, dev, txq, root_lock); } -- cgit v1.2.3 From cc086fcf92996965f0dcf05c6641d65381705266 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Thu, 28 Aug 2014 10:02:41 +0800 Subject: tipc: fix a potential oops Commit 6c9808ce09f7 ("tipc: remove port_lock") accidentally involves a potential bug: when tipc socket instance(tsk) is not got with given reference number in tipc_sk_get(), tsk is set to NULL. Subsequently we jump to exit label where to decrease socket reference counter pointed by tsk pointer in tipc_sk_put(). However, As now tsk is NULL, oops may happen because of touching a NULL pointer. Signed-off-by: Ying Xue Acked-by: Erik Hugne Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d416e83ce069..75275c5cf929 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2118,9 +2118,9 @@ static void tipc_sk_timeout(unsigned long ref) tsk = tipc_sk_get(ref); if (!tsk) - goto exit; - sk = &tsk->sk; + return; + sk = &tsk->sk; bh_lock_sock(sk); if (!tsk->connected) { bh_unlock_sock(sk); -- cgit v1.2.3 From 77cffe23c1f88835f6bd7b47bfa0c060c2969828 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Aug 2014 21:26:46 -0700 Subject: net: Clarification of CHECKSUM_UNNECESSARY This patch: - Clarifies the specific requirements of devices returning CHECKSUM_UNNECESSARY (comments in skbuff.h). - Adds csum_level field to skbuff. This is used to express how many checksums are covered by CHECKSUM_UNNECESSARY (stores n - 1). This replaces the overloading of skb->encapsulation, that field is is now only used to indicate inner headers are valid. - Change __skb_checksum_validate_needed to "consume" each checksum as indicated by csum_level as layers of the the packet are parsed. - Remove skb_pop_rcv_encapsulation, no longer needed in the new csum_level model. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 -- include/linux/skbuff.h | 74 ++++++++++++++++++++++++++++++++++---------------- net/ipv4/gre_demux.c | 1 - 3 files changed, 51 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index beb377b2d4b7..67527f3d3be2 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1158,8 +1158,6 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (!vs) goto drop; - skb_pop_rcv_encapsulation(skb); - vs->rcv(vs, skb, vxh->vx_vni); return 0; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3c9574c80933..c93b5859a772 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -47,11 +47,29 @@ * * The hardware you're dealing with doesn't calculate the full checksum * (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums - * for specific protocols e.g. TCP/UDP/SCTP, then, for such packets it will - * set CHECKSUM_UNNECESSARY if their checksums are okay. skb->csum is still - * undefined in this case though. It is a bad option, but, unfortunately, - * nowadays most vendors do this. Apparently with the secret goal to sell - * you new devices, when you will add new protocol to your host, f.e. IPv6 8) + * for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY + * if their checksums are okay. skb->csum is still undefined in this case + * though. It is a bad option, but, unfortunately, nowadays most vendors do + * this. Apparently with the secret goal to sell you new devices, when you + * will add new protocol to your host, f.e. IPv6 8) + * + * CHECKSUM_UNNECESSARY is applicable to following protocols: + * TCP: IPv6 and IPv4. + * UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a + * zero UDP checksum for either IPv4 or IPv6, the networking stack + * may perform further validation in this case. + * GRE: only if the checksum is present in the header. + * SCTP: indicates the CRC in SCTP header has been validated. + * + * skb->csum_level indicates the number of consecutive checksums found in + * the packet minus one that have been verified as CHECKSUM_UNNECESSARY. + * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet + * and a device is able to verify the checksums for UDP (possibly zero), + * GRE (checksum flag is set), and TCP-- skb->csum_level would be set to + * two. If the device were only able to verify the UDP checksum and not + * GRE, either because it doesn't support GRE checksum of because GRE + * checksum is bad, skb->csum_level would be set to zero (TCP checksum is + * not considered in this case). * * CHECKSUM_COMPLETE: * @@ -112,6 +130,9 @@ #define CHECKSUM_COMPLETE 2 #define CHECKSUM_PARTIAL 3 +/* Maximum value in skb->csum_level */ +#define SKB_MAX_CSUM_LEVEL 3 + #define SKB_DATA_ALIGN(X) ALIGN(X, SMP_CACHE_BYTES) #define SKB_WITH_OVERHEAD(X) \ ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) @@ -571,11 +592,7 @@ struct sk_buff { __u8 wifi_acked:1; __u8 no_fcs:1; __u8 head_frag:1; - /* Encapsulation protocol and NIC drivers should use - * this flag to indicate to each other if the skb contains - * encapsulated packet or not and maybe use the inner packet - * headers if needed - */ + /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; @@ -599,7 +616,8 @@ struct sk_buff { }; kmemcheck_bitfield_begin(flags3); - /* 16 bit hole */ + __u8 csum_level:2; + /* 14 bit hole */ kmemcheck_bitfield_end(flags3); __be16 inner_protocol; @@ -1866,18 +1884,6 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) return pskb_may_pull(skb, skb_network_offset(skb) + len); } -static inline void skb_pop_rcv_encapsulation(struct sk_buff *skb) -{ - /* Only continue with checksum unnecessary if device indicated - * it is valid across encapsulation (skb->encapsulation was set). - */ - if (skb->ip_summed == CHECKSUM_UNNECESSARY && !skb->encapsulation) - skb->ip_summed = CHECKSUM_NONE; - - skb->encapsulation = 0; - skb->csum_valid = 0; -} - /* * CPUs often take a performance hit when accessing unaligned memory * locations. The actual performance hit varies, it can be small if the @@ -2798,6 +2804,27 @@ static inline __sum16 skb_checksum_complete(struct sk_buff *skb) 0 : __skb_checksum_complete(skb); } +static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level == 0) + skb->ip_summed = CHECKSUM_NONE; + else + skb->csum_level--; + } +} + +static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level < SKB_MAX_CSUM_LEVEL) + skb->csum_level++; + } else if (skb->ip_summed == CHECKSUM_NONE) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = 0; + } +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise @@ -2809,6 +2836,7 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, { if (skb_csum_unnecessary(skb) || (zero_okay && !check)) { skb->csum_valid = 1; + __skb_decr_checksum_unnecessary(skb); return false; } diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 7c1a8ff974dd..0485bf7f8f03 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -125,7 +125,6 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, *csum_err = true; return -EINVAL; } - skb_pop_rcv_encapsulation(skb); options++; } -- cgit v1.2.3 From 662880f4420340aad4f9a62a349c6c9d4faa1a5d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Aug 2014 21:26:56 -0700 Subject: net: Allow GRO to use and set levels of checksum unnecessary Allow GRO path to "consume" checksums provided in CHECKSUM_UNNECESSARY and to report new checksums verfied for use in fallback to normal path. Change GRO checksum path to track csum_level using a csum_cnt field in NAPI_GRO_CB. On GRO initialization, if ip_summed is CHECKSUM_UNNECESSARY set NAPI_GRO_CB(skb)->csum_cnt to skb->csum_level + 1. For each checksum verified, decrement NAPI_GRO_CB(skb)->csum_cnt while its greater than zero. If a checksum is verfied and NAPI_GRO_CB(skb)->csum_cnt == 0, we have verified a deeper checksum than originally indicated in skbuf so increment csum_level (or initialize to CHECKSUM_UNNECESSARY if ip_summed is CHECKSUM_NONE or CHECKSUM_COMPLETE). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 26 ++++++++++++-------------- net/core/dev.c | 24 ++++++++++++++++-------- net/ipv4/gre_offload.c | 7 ++----- net/ipv4/udp_offload.c | 5 +++-- 4 files changed, 33 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dfc1d8b8bd0f..456eb1fe51e8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1883,8 +1883,8 @@ struct napi_gro_cb { /* GRO checksum is valid */ u8 csum_valid:1; - /* Number encapsulation layers crossed */ - u8 encapsulation; + /* Number of checksums via CHECKSUM_UNNECESSARY */ + u8 csum_cnt:3; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2179,8 +2179,7 @@ static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, __sum16 check) { return (skb->ip_summed != CHECKSUM_PARTIAL && - (skb->ip_summed != CHECKSUM_UNNECESSARY || - (NAPI_GRO_CB(skb)->encapsulation > skb->encapsulation)) && + NAPI_GRO_CB(skb)->csum_cnt == 0 && (!zero_okay || check)); } @@ -2196,18 +2195,17 @@ static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, return __skb_gro_checksum_complete(skb); } -/* Update skb for CHECKSUM_UNNECESSARY when we verified a top level - * checksum or an encapsulated one during GRO. This saves work - * if we fallback to normal path with the packet. - */ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) { - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (NAPI_GRO_CB(skb)->encapsulation) - skb->encapsulation = 1; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->encapsulation = 0; + if (NAPI_GRO_CB(skb)->csum_cnt > 0) { + /* Consume a checksum from CHECKSUM_UNNECESSARY */ + NAPI_GRO_CB(skb)->csum_cnt--; + } else { + /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we + * verified a new top level checksum or an encapsulated one + * during GRO. This saves work if we fallback to normal path. + */ + __skb_incr_checksum_unnecessary(skb); } } diff --git a/net/core/dev.c b/net/core/dev.c index 26d296c2447c..a6077ef56345 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3962,13 +3962,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff gro_list_prepare(napi, skb); - if (skb->ip_summed == CHECKSUM_COMPLETE) { - NAPI_GRO_CB(skb)->csum = skb->csum; - NAPI_GRO_CB(skb)->csum_valid = 1; - } else { - NAPI_GRO_CB(skb)->csum_valid = 0; - } - rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type != type || !ptype->callbacks.gro_receive) @@ -3980,7 +3973,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; - NAPI_GRO_CB(skb)->encapsulation = 0; + + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + NAPI_GRO_CB(skb)->csum_cnt = 0; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + NAPI_GRO_CB(skb)->csum_valid = 0; + break; + default: + NAPI_GRO_CB(skb)->csum_cnt = 0; + NAPI_GRO_CB(skb)->csum_valid = 0; + } pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index d1bd16937d93..a4d7965fb880 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -172,12 +172,9 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, } /* Don't bother verifying checksum if we're going to flush anyway. */ - if (greh->flags & GRE_CSUM) { - if (!NAPI_GRO_CB(skb)->flush && - skb_gro_checksum_simple_validate(skb)) + if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_simple_validate(skb)) goto out_unlock; - NAPI_GRO_CB(skb)->encapsulation++; - } flush = 0; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 8ed460e3753c..a6adff98382a 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -238,12 +238,13 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, int flush = 1; if (NAPI_GRO_CB(skb)->udp_mark || - (!skb->encapsulation && !NAPI_GRO_CB(skb)->csum_valid)) + (skb->ip_summed != CHECKSUM_PARTIAL && + NAPI_GRO_CB(skb)->csum_cnt == 0 && + !NAPI_GRO_CB(skb)->csum_valid)) goto out; /* mark that this skb passed once through the udp gro layer */ NAPI_GRO_CB(skb)->udp_mark = 1; - NAPI_GRO_CB(skb)->encapsulation++; rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); -- cgit v1.2.3 From 202863fe4c7a5b0b9a3d3a00d207691635b31930 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Aug 2014 21:27:06 -0700 Subject: sctp: Change sctp to implement csum_levels CHECKSUM_UNNECESSARY may be applied to the SCTP CRC so we need to appropriate account for this by decrementing csum_level. This is done by calling __skb_dec_checksum_unnecessary. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/sctp/input.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index c1b991294516..b6493b3f11a9 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -133,9 +133,13 @@ int sctp_rcv(struct sk_buff *skb) __skb_pull(skb, skb_transport_offset(skb)); if (skb->len < sizeof(struct sctphdr)) goto discard_it; - if (!sctp_checksum_disable && !skb_csum_unnecessary(skb) && - sctp_rcv_checksum(net, skb) < 0) + + skb->csum_valid = 0; /* Previous value not applicable */ + if (skb_csum_unnecessary(skb)) + __skb_decr_checksum_unnecessary(skb); + else if (!sctp_checksum_disable && sctp_rcv_checksum(net, skb) < 0) goto discard_it; + skb->csum_valid = 1; skb_pull(skb, sizeof(struct sctphdr)); -- cgit v1.2.3 From dddb3da046a4d86de649ba795726afa7fe6fbb41 Mon Sep 17 00:00:00 2001 From: "Mark A. Greer" Date: Tue, 22 Jul 2014 20:18:01 -0700 Subject: NFC: digital: Add Inititor-side PSL support In order to operate at the fasted bit rate possible, add initiator-side support for PSL REQ while in P2P mode. The PSL REQ will switch the RF technology to 424F whenever possible. Reviewed-by: Thierry Escande Tested-by: Thierry Escande Signed-off-by: Mark A. Greer Signed-off-by: Samuel Ortiz --- net/nfc/digital_dep.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'net') diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index e1638dab076d..b60aa35c074f 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -33,6 +33,8 @@ #define DIGITAL_ATR_REQ_MAX_SIZE 64 #define DIGITAL_LR_BITS_PAYLOAD_SIZE_254B 0x30 +#define DIGITAL_FSL_BITS_PAYLOAD_SIZE_254B \ + (DIGITAL_LR_BITS_PAYLOAD_SIZE_254B >> 4) #define DIGITAL_GB_BIT 0x02 #define DIGITAL_NFC_DEP_PFB_TYPE(pfb) ((pfb) & 0xE0) @@ -127,6 +129,98 @@ static int digital_skb_pull_dep_sod(struct nfc_digital_dev *ddev, return 0; } +static void digital_in_recv_psl_res(struct nfc_digital_dev *ddev, void *arg, + struct sk_buff *resp) +{ + struct nfc_target *target = arg; + struct digital_psl_res *psl_res; + int rc; + + if (IS_ERR(resp)) { + rc = PTR_ERR(resp); + resp = NULL; + goto exit; + } + + rc = ddev->skb_check_crc(resp); + if (rc) { + PROTOCOL_ERR("14.4.1.6"); + goto exit; + } + + rc = digital_skb_pull_dep_sod(ddev, resp); + if (rc) { + PROTOCOL_ERR("14.4.1.2"); + goto exit; + } + + psl_res = (struct digital_psl_res *)resp->data; + + if ((resp->len != sizeof(*psl_res)) || + (psl_res->dir != DIGITAL_NFC_DEP_FRAME_DIR_IN) || + (psl_res->cmd != DIGITAL_CMD_PSL_RES)) { + rc = -EIO; + goto exit; + } + + rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, + NFC_DIGITAL_RF_TECH_424F); + if (rc) + goto exit; + + rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, + NFC_DIGITAL_FRAMING_NFCF_NFC_DEP); + if (rc) + goto exit; + + if (!DIGITAL_DRV_CAPS_IN_CRC(ddev) && + (ddev->curr_rf_tech == NFC_DIGITAL_RF_TECH_106A)) { + ddev->skb_add_crc = digital_skb_add_crc_f; + ddev->skb_check_crc = digital_skb_check_crc_f; + } + + ddev->curr_rf_tech = NFC_DIGITAL_RF_TECH_424F; + + nfc_dep_link_is_up(ddev->nfc_dev, target->idx, NFC_COMM_ACTIVE, + NFC_RF_INITIATOR); + + ddev->curr_nfc_dep_pni = 0; + +exit: + dev_kfree_skb(resp); + + if (rc) + ddev->curr_protocol = 0; +} + +static int digital_in_send_psl_req(struct nfc_digital_dev *ddev, + struct nfc_target *target) +{ + struct sk_buff *skb; + struct digital_psl_req *psl_req; + + skb = digital_skb_alloc(ddev, sizeof(*psl_req)); + if (!skb) + return -ENOMEM; + + skb_put(skb, sizeof(*psl_req)); + + psl_req = (struct digital_psl_req *)skb->data; + + psl_req->dir = DIGITAL_NFC_DEP_FRAME_DIR_OUT; + psl_req->cmd = DIGITAL_CMD_PSL_REQ; + psl_req->did = 0; + psl_req->brs = (0x2 << 3) | 0x2; /* 424F both directions */ + psl_req->fsl = DIGITAL_FSL_BITS_PAYLOAD_SIZE_254B; + + digital_skb_push_dep_sod(ddev, skb); + + ddev->skb_add_crc(skb); + + return digital_in_send_cmd(ddev, skb, 500, digital_in_recv_psl_res, + target); +} + static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg, struct sk_buff *resp) { @@ -166,6 +260,13 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg, if (rc) goto exit; + if ((ddev->protocols & NFC_PROTO_FELICA_MASK) && + (ddev->curr_rf_tech != NFC_DIGITAL_RF_TECH_424F)) { + rc = digital_in_send_psl_req(ddev, target); + if (!rc) + goto exit; + } + rc = nfc_dep_link_is_up(ddev->nfc_dev, target->idx, NFC_COMM_ACTIVE, NFC_RF_INITIATOR); -- cgit v1.2.3 From cfdbeeafdbbdbc006f700e92cbad2cb5d4529f3d Mon Sep 17 00:00:00 2001 From: Vincent Cuissard Date: Tue, 22 Jul 2014 19:48:38 +0200 Subject: NFC: NCI: Add support of ISO15693 Update nci.h to respect latest NCI specification proposal (stop using proprietary opcodes). Handle ISO15693 parameters in NCI_RF_ACTIVATED_NTF handler. Signed-off-by: Vincent Cuissard Signed-off-by: Samuel Ortiz --- include/net/nfc/nci.h | 16 +++++++++++++--- net/nfc/nci/core.c | 8 ++++++++ net/nfc/nci/ntf.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/nfc/nci.h b/include/net/nfc/nci.h index fbfa4e471abb..9eca9ae2280c 100644 --- a/include/net/nfc/nci.h +++ b/include/net/nfc/nci.h @@ -2,6 +2,7 @@ * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * + * Copyright (C) 2014 Marvell International Ltd. * Copyright (C) 2011 Texas Instruments, Inc. * * Written by Ilan Elias @@ -65,19 +66,18 @@ #define NCI_NFC_F_PASSIVE_POLL_MODE 0x02 #define NCI_NFC_A_ACTIVE_POLL_MODE 0x03 #define NCI_NFC_F_ACTIVE_POLL_MODE 0x05 -#define NCI_NFC_15693_PASSIVE_POLL_MODE 0x06 +#define NCI_NFC_V_PASSIVE_POLL_MODE 0x06 #define NCI_NFC_A_PASSIVE_LISTEN_MODE 0x80 #define NCI_NFC_B_PASSIVE_LISTEN_MODE 0x81 #define NCI_NFC_F_PASSIVE_LISTEN_MODE 0x82 #define NCI_NFC_A_ACTIVE_LISTEN_MODE 0x83 #define NCI_NFC_F_ACTIVE_LISTEN_MODE 0x85 -#define NCI_NFC_15693_PASSIVE_LISTEN_MODE 0x86 /* NCI RF Technologies */ #define NCI_NFC_RF_TECHNOLOGY_A 0x00 #define NCI_NFC_RF_TECHNOLOGY_B 0x01 #define NCI_NFC_RF_TECHNOLOGY_F 0x02 -#define NCI_NFC_RF_TECHNOLOGY_15693 0x03 +#define NCI_NFC_RF_TECHNOLOGY_V 0x03 /* NCI Bit Rates */ #define NCI_NFC_BIT_RATE_106 0x00 @@ -87,6 +87,7 @@ #define NCI_NFC_BIT_RATE_1695 0x04 #define NCI_NFC_BIT_RATE_3390 0x05 #define NCI_NFC_BIT_RATE_6780 0x06 +#define NCI_NFC_BIT_RATE_26 0x20 /* NCI RF Protocols */ #define NCI_RF_PROTOCOL_UNKNOWN 0x00 @@ -95,6 +96,7 @@ #define NCI_RF_PROTOCOL_T3T 0x03 #define NCI_RF_PROTOCOL_ISO_DEP 0x04 #define NCI_RF_PROTOCOL_NFC_DEP 0x05 +#define NCI_RF_PROTOCOL_T5T 0x06 /* NCI RF Interfaces */ #define NCI_RF_INTERFACE_NFCEE_DIRECT 0x00 @@ -328,6 +330,12 @@ struct rf_tech_specific_params_nfcf_poll { __u8 sensf_res[18]; /* 16 or 18 Bytes */ } __packed; +struct rf_tech_specific_params_nfcv_poll { + __u8 res_flags; + __u8 dsfid; + __u8 uid[8]; /* 8 Bytes */ +} __packed; + struct nci_rf_discover_ntf { __u8 rf_discovery_id; __u8 rf_protocol; @@ -338,6 +346,7 @@ struct nci_rf_discover_ntf { struct rf_tech_specific_params_nfca_poll nfca_poll; struct rf_tech_specific_params_nfcb_poll nfcb_poll; struct rf_tech_specific_params_nfcf_poll nfcf_poll; + struct rf_tech_specific_params_nfcv_poll nfcv_poll; } rf_tech_specific_params; __u8 ntf_type; @@ -372,6 +381,7 @@ struct nci_rf_intf_activated_ntf { struct rf_tech_specific_params_nfca_poll nfca_poll; struct rf_tech_specific_params_nfcb_poll nfcb_poll; struct rf_tech_specific_params_nfcf_poll nfcf_poll; + struct rf_tech_specific_params_nfcv_poll nfcv_poll; } rf_tech_specific_params; __u8 data_exch_rf_tech_and_mode; diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 2b400e1a8695..860080803a3e 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -231,6 +231,14 @@ static void nci_rf_discover_req(struct nci_dev *ndev, unsigned long opt) cmd.num_disc_configs++; } + if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS) && + (protocols & NFC_PROTO_ISO15693_MASK)) { + cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode = + NCI_NFC_V_PASSIVE_POLL_MODE; + cmd.disc_configs[cmd.num_disc_configs].frequency = 1; + cmd.num_disc_configs++; + } + nci_send_cmd(ndev, NCI_OP_RF_DISCOVER_CMD, (1 + (cmd.num_disc_configs * sizeof(struct disc_config))), &cmd); diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index df91bb95b12a..25e44cebd60a 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -2,6 +2,7 @@ * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * + * Copyright (C) 2014 Marvell International Ltd. * Copyright (C) 2011 Texas Instruments, Inc. * * Written by Ilan Elias @@ -155,6 +156,17 @@ static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, return data; } +static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcv_poll *nfcv_poll, + __u8 *data) +{ + ++data; + nfcv_poll->dsfid = *data++; + memcpy(nfcv_poll->uid, data, NFC_ISO15693_UID_MAXSIZE); + data += NFC_ISO15693_UID_MAXSIZE; + return data; +} + static int nci_add_new_protocol(struct nci_dev *ndev, struct nfc_target *target, __u8 rf_protocol, @@ -164,6 +176,7 @@ static int nci_add_new_protocol(struct nci_dev *ndev, struct rf_tech_specific_params_nfca_poll *nfca_poll; struct rf_tech_specific_params_nfcb_poll *nfcb_poll; struct rf_tech_specific_params_nfcf_poll *nfcf_poll; + struct rf_tech_specific_params_nfcv_poll *nfcv_poll; __u32 protocol; if (rf_protocol == NCI_RF_PROTOCOL_T1T) @@ -179,6 +192,8 @@ static int nci_add_new_protocol(struct nci_dev *ndev, protocol = NFC_PROTO_FELICA_MASK; else if (rf_protocol == NCI_RF_PROTOCOL_NFC_DEP) protocol = NFC_PROTO_NFC_DEP_MASK; + else if (rf_protocol == NCI_RF_PROTOCOL_T5T) + protocol = NFC_PROTO_ISO15693_MASK; else protocol = 0; @@ -213,6 +228,12 @@ static int nci_add_new_protocol(struct nci_dev *ndev, memcpy(target->sensf_res, nfcf_poll->sensf_res, target->sensf_res_len); } + } else if (rf_tech_and_mode == NCI_NFC_V_PASSIVE_POLL_MODE) { + nfcv_poll = (struct rf_tech_specific_params_nfcv_poll *)params; + + target->is_iso15693 = 1; + target->iso15693_dsfid = nfcv_poll->dsfid; + memcpy(target->iso15693_uid, nfcv_poll->uid, NFC_ISO15693_UID_MAXSIZE); } else { pr_err("unsupported rf_tech_and_mode 0x%x\n", rf_tech_and_mode); return -EPROTO; @@ -305,6 +326,11 @@ static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, &(ntf.rf_tech_specific_params.nfcf_poll), data); break; + case NCI_NFC_V_PASSIVE_POLL_MODE: + data = nci_extract_rf_params_nfcv_passive_poll(ndev, + &(ntf.rf_tech_specific_params.nfcv_poll), data); + break; + default: pr_err("unsupported rf_tech_and_mode 0x%x\n", ntf.rf_tech_and_mode); @@ -455,6 +481,11 @@ static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, &(ntf.rf_tech_specific_params.nfcf_poll), data); break; + case NCI_NFC_V_PASSIVE_POLL_MODE: + data = nci_extract_rf_params_nfcv_passive_poll(ndev, + &(ntf.rf_tech_specific_params.nfcv_poll), data); + break; + default: pr_err("unsupported activation_rf_tech_and_mode 0x%x\n", ntf.activation_rf_tech_and_mode); -- cgit v1.2.3 From 3c1c0f5dc80bbde5baef2403cc6a0d33c9824d2d Mon Sep 17 00:00:00 2001 From: Vincent Cuissard Date: Tue, 22 Jul 2014 19:48:39 +0200 Subject: NFC: NCI: Fix nci_register_device init sequence All contexts have to be initiliazed before calling nfc_register_device otherwise it is possible to call nci_dev_up before ending the nci_register_device function. In such case kernel will crash on non initialized variables. Signed-off-by: Vincent Cuissard Signed-off-by: Samuel Ortiz --- net/nfc/nci/core.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 860080803a3e..90b16cb40058 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -759,10 +759,6 @@ int nci_register_device(struct nci_dev *ndev) struct device *dev = &ndev->nfc_dev->dev; char name[32]; - rc = nfc_register_device(ndev->nfc_dev); - if (rc) - goto exit; - ndev->flags = 0; INIT_WORK(&ndev->cmd_work, nci_cmd_work); @@ -770,7 +766,7 @@ int nci_register_device(struct nci_dev *ndev) ndev->cmd_wq = create_singlethread_workqueue(name); if (!ndev->cmd_wq) { rc = -ENOMEM; - goto unreg_exit; + goto exit; } INIT_WORK(&ndev->rx_work, nci_rx_work); @@ -800,6 +796,10 @@ int nci_register_device(struct nci_dev *ndev) mutex_init(&ndev->req_lock); + rc = nfc_register_device(ndev->nfc_dev); + if (rc) + goto destroy_rx_wq_exit; + goto exit; destroy_rx_wq_exit: @@ -808,9 +808,6 @@ destroy_rx_wq_exit: destroy_cmd_wq_exit: destroy_workqueue(ndev->cmd_wq); -unreg_exit: - nfc_unregister_device(ndev->nfc_dev); - exit: return rc; } -- cgit v1.2.3 From 83724c3329c93f9efc7f53498edd4c538e724366 Mon Sep 17 00:00:00 2001 From: Vincent Cuissard Date: Tue, 22 Jul 2014 19:48:40 +0200 Subject: NFC: NCI: Fix NCI RF FRAME interface usage NCI RF FRAME interface is used for all kind of tags except ISODEP ones. So for all other kind of tags the status byte has to be removed. Signed-off-by: Vincent Cuissard Signed-off-by: Samuel Ortiz --- net/nfc/nci/data.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 6c3aef852876..427ef2c7ab68 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -241,9 +241,12 @@ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb) /* strip the nci data header */ skb_pull(skb, NCI_DATA_HDR_SIZE); - if (ndev->target_active_prot == NFC_PROTO_MIFARE) { + if (ndev->target_active_prot == NFC_PROTO_MIFARE || + ndev->target_active_prot == NFC_PROTO_JEWEL || + ndev->target_active_prot == NFC_PROTO_FELICA || + ndev->target_active_prot == NFC_PROTO_ISO15693) { /* frame I/F => remove the status byte */ - pr_debug("NFC_PROTO_MIFARE => remove the status byte\n"); + pr_debug("frame I/F => remove the status byte\n"); skb_trim(skb, (skb->len - 1)); } -- cgit v1.2.3 From 10b3ad8c21bb4b135768c30dd4c51a1c744da699 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:07:24 -0700 Subject: net: Do txq_trans_update() in netdev_start_xmit() That way we don't have to audit every call site to make sure it is doing this properly. Signed-off-by: David S. Miller --- drivers/net/wan/dlci.c | 6 ++++-- include/linux/netdevice.h | 10 ++++++++-- net/core/dev.c | 7 ++----- net/core/netpoll.c | 4 +--- net/core/pktgen.c | 3 +-- net/packet/af_packet.c | 7 ++----- net/sched/sch_teql.c | 3 +-- 7 files changed, 19 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 81b22a180aad..6427e8283419 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -192,8 +192,10 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) { struct dlci_local *dlp = netdev_priv(dev); - if (skb) - netdev_start_xmit(skb, dlp->slave); + if (skb) { + struct netdev_queue *txq = skb_get_tx_queue(dev, skb); + netdev_start_xmit(skb, dlp->slave, txq); + } return NETDEV_TX_OK; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 456eb1fe51e8..16171802ea7d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3437,11 +3437,17 @@ static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, return ops->ndo_start_xmit(skb, dev); } -static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev) +static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; + int rc; - return __netdev_start_xmit(ops, skb, dev); + rc = __netdev_start_xmit(ops, skb, dev); + if (rc == NETDEV_TX_OK) + txq_trans_update(txq); + + return rc; } int netdev_class_create_file_ns(struct class_attribute *class_attr, diff --git a/net/core/dev.c b/net/core/dev.c index a6077ef56345..6392adaaa22f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2666,10 +2666,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq); trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) - txq_trans_update(txq); return rc; } @@ -2685,7 +2683,7 @@ gso: skb_len = nskb->len; trace_net_dev_start_xmit(nskb, dev); - rc = netdev_start_xmit(nskb, dev); + rc = netdev_start_xmit(nskb, dev, txq); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) @@ -2694,7 +2692,6 @@ gso: skb->next = nskb; return rc; } - txq_trans_update(txq); if (unlikely(netif_xmit_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 12b1df976562..05bc57edaa81 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -91,9 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = netdev_start_xmit(skb, dev); - if (status == NETDEV_TX_OK) - txq_trans_update(txq); + status = netdev_start_xmit(skb, dev, txq); out: return status; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d81b540096c3..34bd2ff9f121 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3335,11 +3335,10 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = netdev_start_xmit(pkt_dev->skb, odev); + ret = netdev_start_xmit(pkt_dev->skb, odev, txq); switch (ret) { case NETDEV_TX_OK: - txq_trans_update(txq); pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b7a7f5a721bd..fe305a05a8fc 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -258,11 +258,8 @@ static int packet_direct_xmit(struct sk_buff *skb) local_bh_disable(); HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_drv_stopped(txq)) { - ret = netdev_start_xmit(skb, dev); - if (ret == NETDEV_TX_OK) - txq_trans_update(txq); - } + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); local_bh_enable(); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 64cd93ca8104..193dc2cba1ec 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -316,8 +316,7 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - netdev_start_xmit(skb, slave) == NETDEV_TX_OK) { - txq_trans_update(slave_txq); + netdev_start_xmit(skb, slave, slave_txq) == NETDEV_TX_OK) { __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); -- cgit v1.2.3 From 2ea255137555052655c6a646c4e48ea7481494c7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:10:01 -0700 Subject: net: Create xmit_one() helper for dev_hard_start_xmit() Hopefully making the code a bit easier to read and digest. Signed-off-by: David S. Miller --- net/core/dev.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 6392adaaa22f..0fde7d2153db 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2599,11 +2599,27 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) } EXPORT_SYMBOL(netif_skb_features); +static int xmit_one(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) +{ + unsigned int len; + int rc; + + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + + len = skb->len; + trace_net_dev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq); + trace_net_dev_xmit(skb, rc, dev, len); + + return rc; +} + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { int rc = NETDEV_TX_OK; - unsigned int skb_len; if (likely(!skb->next)) { netdev_features_t features; @@ -2661,14 +2677,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } } - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); - - skb_len = skb->len; - trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev, txq); - trace_net_dev_xmit(skb, rc, dev, skb_len); - return rc; + return xmit_one(skb, dev, txq); } gso: @@ -2678,13 +2687,7 @@ gso: skb->next = nskb->next; nskb->next = NULL; - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(nskb, dev); - - skb_len = nskb->len; - trace_net_dev_start_xmit(nskb, dev); - rc = netdev_start_xmit(nskb, dev, txq); - trace_net_dev_xmit(nskb, rc, dev, skb_len); + rc = xmit_one(nskb, dev, txq); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) goto out_kfree_gso_skb; -- cgit v1.2.3 From 7f2e870f2a48a0524a3b03b04fa019311d16a7f7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:19:14 -0700 Subject: net: Move main gso loop out of dev_hard_start_xmit() into helper. There is a slight policy change happening here as well. The previous code would drop the entire rest of the GSO skb if any of them got, for example, a congestion notification. That makes no sense, anything NET_XMIT_MASK and below is something like congestion or policing. And in the congestion case it doesn't even mean the packet was actually dropped. Just continue until dev_xmit_complete() evaluates to false. Signed-off-by: David S. Miller --- net/core/dev.c | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 0fde7d2153db..ab7bb809711e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2616,6 +2616,34 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, return rc; } +static struct sk_buff *xmit_list(struct sk_buff *first, struct net_device *dev, + struct netdev_queue *txq, int *ret) +{ + struct sk_buff *skb = first; + int rc = NETDEV_TX_OK; + + while (skb) { + struct sk_buff *next = skb->next; + + skb->next = NULL; + rc = xmit_one(skb, dev, txq); + if (unlikely(!dev_xmit_complete(rc))) { + skb->next = next; + goto out; + } + + skb = next; + if (netif_xmit_stopped(txq) && skb) { + rc = NETDEV_TX_BUSY; + break; + } + } + +out: + *ret = rc; + return skb; +} + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { @@ -2681,25 +2709,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } gso: - do { - struct sk_buff *nskb = skb->next; - - skb->next = nskb->next; - nskb->next = NULL; - - rc = xmit_one(nskb, dev, txq); - if (unlikely(rc != NETDEV_TX_OK)) { - if (rc & ~NETDEV_TX_MASK) - goto out_kfree_gso_skb; - nskb->next = skb->next; - skb->next = nskb; - return rc; - } - if (unlikely(netif_xmit_stopped(txq) && skb->next)) - return NETDEV_TX_BUSY; - } while (skb->next); - -out_kfree_gso_skb: + skb->next = xmit_list(skb->next, dev, txq, &rc); if (likely(skb->next == NULL)) { skb->destructor = DEV_GSO_CB(skb)->destructor; consume_skb(skb); -- cgit v1.2.3 From fa2dbdc253c2aee2a760c64de454cb62469ec11d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:55:22 -0700 Subject: net: Pass a "more" indication down into netdev_start_xmit() code paths. For now it will always be false. Signed-off-by: David S. Miller --- drivers/net/wan/dlci.c | 2 +- include/linux/netdevice.h | 9 +++++---- net/atm/mpc.c | 2 +- net/core/dev.c | 2 +- net/core/netpoll.c | 2 +- net/core/pktgen.c | 2 +- net/packet/af_packet.c | 2 +- net/sched/sch_teql.c | 3 ++- 8 files changed, 13 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 6427e8283419..ae6ecf401189 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -194,7 +194,7 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) if (skb) { struct netdev_queue *txq = skb_get_tx_queue(dev, skb); - netdev_start_xmit(skb, dlp->slave, txq); + netdev_start_xmit(skb, dlp->slave, txq, false); } return NETDEV_TX_OK; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 16171802ea7d..5050218c5b7f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3431,19 +3431,20 @@ int __init dev_proc_init(void); #endif static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, - struct sk_buff *skb, struct net_device *dev) + struct sk_buff *skb, struct net_device *dev, + bool more) { - skb->xmit_more = 0; + skb->xmit_more = more ? 1 : 0; return ops->ndo_start_xmit(skb, dev); } static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) + struct netdev_queue *txq, bool more) { const struct net_device_ops *ops = dev->netdev_ops; int rc; - rc = __netdev_start_xmit(ops, skb, dev); + rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) txq_trans_update(txq); diff --git a/net/atm/mpc.c b/net/atm/mpc.c index d662da161e5a..0e982222d425 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -599,7 +599,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb, } non_ip: - return __netdev_start_xmit(mpc->old_ops, skb, dev); + return __netdev_start_xmit(mpc->old_ops, skb, dev, false); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) diff --git a/net/core/dev.c b/net/core/dev.c index ab7bb809711e..f0ed5a611a97 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2610,7 +2610,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev, txq); + rc = netdev_start_xmit(skb, dev, txq, false); trace_net_dev_xmit(skb, rc, dev, len); return rc; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 05bc57edaa81..e6645b4f330a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -91,7 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = netdev_start_xmit(skb, dev, txq); + status = netdev_start_xmit(skb, dev, txq, false); out: return status; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 34bd2ff9f121..5b36a9428c59 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3335,7 +3335,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = netdev_start_xmit(pkt_dev->skb, odev, txq); + ret = netdev_start_xmit(pkt_dev->skb, odev, txq, false); switch (ret) { case NETDEV_TX_OK: diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index fe305a05a8fc..87d20f48ff06 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -259,7 +259,7 @@ static int packet_direct_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) - ret = netdev_start_xmit(skb, dev, txq); + ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); local_bh_enable(); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 193dc2cba1ec..aaa8d03ed054 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -316,7 +316,8 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - netdev_start_xmit(skb, slave, slave_txq) == NETDEV_TX_OK) { + netdev_start_xmit(skb, slave, slave_txq, false) == + NETDEV_TX_OK) { __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); -- cgit v1.2.3 From 95f6b3dda2a4a052f7dabe9998e4ffac491b7bc2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:57:30 -0700 Subject: net: Have xmit_list() signal more==true when appropriate. Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index f0ed5a611a97..6d82194e414b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2600,7 +2600,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) EXPORT_SYMBOL(netif_skb_features); static int xmit_one(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) + struct netdev_queue *txq, bool more) { unsigned int len; int rc; @@ -2610,7 +2610,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev, txq, false); + rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); return rc; @@ -2626,7 +2626,7 @@ static struct sk_buff *xmit_list(struct sk_buff *first, struct net_device *dev, struct sk_buff *next = skb->next; skb->next = NULL; - rc = xmit_one(skb, dev, txq); + rc = xmit_one(skb, dev, txq, next != NULL); if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; goto out; @@ -2705,7 +2705,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } } - return xmit_one(skb, dev, txq); + return xmit_one(skb, dev, txq, false); } gso: -- cgit v1.2.3 From eae3f88ee44251bcca3a085f9565257c6f9f9e69 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 30 Aug 2014 15:17:13 -0700 Subject: net: Separate out SKB validation logic from transmit path. dev_hard_start_xmit() does two things, it first validates and canonicalizes the SKB, then it actually sends it. Make a set of helper functions for doing the first part. Signed-off-by: David S. Miller --- net/core/dev.c | 125 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 71 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 6d82194e414b..704a5434f77d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2644,80 +2644,97 @@ out: return skb; } -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { - int rc = NETDEV_TX_OK; + if (vlan_tx_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) { + skb = __vlan_put_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (skb) + skb->vlan_tci = 0; + } + return skb; +} - if (likely(!skb->next)) { - netdev_features_t features; +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +{ + netdev_features_t features; - /* - * If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); + if (skb->next) + return skb; - features = netif_skb_features(skb); + /* If device doesn't need skb->dst, release it right now while + * its hot in this cpu cache + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); - if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb)) - goto out; + features = netif_skb_features(skb); + skb = validate_xmit_vlan(skb, features); + if (unlikely(!skb)) + goto out_null; - skb->vlan_tci = 0; - } + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; - /* If encapsulation offload request, verify we are testing - * hardware encapsulation features instead of standard - * features for the netdev - */ - if (skb->encapsulation) - features &= dev->hw_enc_features; + if (netif_needs_gso(skb, features)) { + if (unlikely(dev_gso_segment(skb, features))) + goto out_kfree_skb; + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; - if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } else { - if (skb_needs_linearize(skb, features) && - __skb_linearize(skb)) + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (!(features & NETIF_F_ALL_CSUM) && + skb_checksum_help(skb)) goto out_kfree_skb; - - /* If packet is not checksummed and device does not - * support checksumming for this protocol, complete - * checksumming here. - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->encapsulation) - skb_set_inner_transport_header(skb, - skb_checksum_start_offset(skb)); - else - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && - skb_checksum_help(skb)) - goto out_kfree_skb; - } } + } + + return skb; + +out_kfree_skb: + kfree_skb(skb); +out_null: + return NULL; +} + +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) +{ + int rc = NETDEV_TX_OK; + + skb = validate_xmit_skb(skb, dev); + if (!skb) + return rc; + if (likely(!skb->next)) return xmit_one(skb, dev, txq, false); - } -gso: skb->next = xmit_list(skb->next, dev, txq, &rc); if (likely(skb->next == NULL)) { skb->destructor = DEV_GSO_CB(skb)->destructor; consume_skb(skb); return rc; } -out_kfree_skb: + kfree_skb(skb); -out: + return rc; } EXPORT_SYMBOL_GPL(dev_hard_start_xmit); -- cgit v1.2.3 From 50cbe9ab5f8d92d2d4a327b56e96559d8f63a1fa Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 30 Aug 2014 19:13:51 -0700 Subject: net: Validate xmit SKBs right when we pull them out of the qdisc. Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 6 +----- net/sched/sch_generic.c | 5 ++++- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5050218c5b7f..47c49ba2dcf4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2827,6 +2827,7 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); +struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 704a5434f77d..75bc5b068a13 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2656,7 +2656,7 @@ struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t featur return skb; } -static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -2719,10 +2719,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, { int rc = NETDEV_TX_OK; - skb = validate_xmit_skb(skb, dev); - if (!skb) - return rc; - if (likely(!skb->next)) return xmit_one(skb, dev, txq, false); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 05b3f5d104af..f178798a5836 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -70,8 +70,11 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) } else skb = NULL; } else { - if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) + if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) { skb = q->dequeue(q); + if (skb) + skb = validate_xmit_skb(skb, qdisc_dev(q)); + } } return skb; -- cgit v1.2.3 From ce93718fb7cdbc064c3000ff59e4d3200bdfa744 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 30 Aug 2014 19:22:20 -0700 Subject: net: Don't keep around original SKB when we software segment GSO frames. Just maintain the list properly by returning the head of the remaining SKB list from dev_hard_start_xmit(). Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +-- net/core/dev.c | 79 +++++++++-------------------------------------- net/sched/sch_generic.c | 2 +- 3 files changed, 17 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 47c49ba2dcf4..202c25a9aadf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2828,8 +2828,8 @@ int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq); +struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, int *ret); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 75bc5b068a13..c89da4f306b1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2485,52 +2485,6 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) return 0; } -struct dev_gso_cb { - void (*destructor)(struct sk_buff *skb); -}; - -#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) - -static void dev_gso_skb_destructor(struct sk_buff *skb) -{ - struct dev_gso_cb *cb; - - kfree_skb_list(skb->next); - skb->next = NULL; - - cb = DEV_GSO_CB(skb); - if (cb->destructor) - cb->destructor(skb); -} - -/** - * dev_gso_segment - Perform emulated hardware segmentation on skb. - * @skb: buffer to segment - * @features: device features as applicable to this skb - * - * This function segments the given skb and stores the list of segments - * in skb->next. - */ -static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) -{ - struct sk_buff *segs; - - segs = skb_gso_segment(skb, features); - - /* Verifying header integrity only. */ - if (!segs) - return 0; - - if (IS_ERR(segs)) - return PTR_ERR(segs); - - skb->next = segs; - DEV_GSO_CB(skb)->destructor = skb->destructor; - skb->destructor = dev_gso_skb_destructor; - - return 0; -} - /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. */ @@ -2682,8 +2636,13 @@ struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) features &= dev->hw_enc_features; if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; + struct sk_buff *segs; + + segs = skb_gso_segment(skb, features); + kfree_skb(skb); + if (IS_ERR(segs)) + segs = NULL; + skb = segs; } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) @@ -2714,26 +2673,16 @@ out_null: return NULL; } -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, int *ret) { - int rc = NETDEV_TX_OK; - - if (likely(!skb->next)) - return xmit_one(skb, dev, txq, false); - - skb->next = xmit_list(skb->next, dev, txq, &rc); - if (likely(skb->next == NULL)) { - skb->destructor = DEV_GSO_CB(skb)->destructor; - consume_skb(skb); - return rc; + if (likely(!skb->next)) { + *ret = xmit_one(skb, dev, txq, false); + return skb; } - kfree_skb(skb); - - return rc; + return xmit_list(skb, dev, txq, ret); } -EXPORT_SYMBOL_GPL(dev_hard_start_xmit); static void qdisc_pkt_len_init(struct sk_buff *skb) { @@ -2945,7 +2894,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &rc); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f178798a5836..a8bf9f9928bd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -129,7 +129,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) - ret = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &ret); HARD_TX_UNLOCK(dev, txq); -- cgit v1.2.3 From 8dcda22a5d0abaf347b21b057655f3809b91639d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 1 Sep 2014 15:06:40 -0700 Subject: net: xmit_list() becomes dev_hard_start_xmit(). Now fundamentally we can process lists of SKBs as cheaply as single packets. Signed-off-by: David S. Miller --- net/core/dev.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c89da4f306b1..6857d57aa294 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2570,8 +2570,8 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, return rc; } -static struct sk_buff *xmit_list(struct sk_buff *first, struct net_device *dev, - struct netdev_queue *txq, int *ret) +struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, + struct netdev_queue *txq, int *ret) { struct sk_buff *skb = first; int rc = NETDEV_TX_OK; @@ -2673,17 +2673,6 @@ out_null: return NULL; } -struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq, int *ret) -{ - if (likely(!skb->next)) { - *ret = xmit_one(skb, dev, txq, false); - return skb; - } - - return xmit_list(skb, dev, txq, ret); -} - static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); -- cgit v1.2.3 From f4ad8a4b8b9f490a15c3239e0d6ac99e7e438d34 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 28 Aug 2014 09:08:46 +0200 Subject: tipc: refactor name table updates out of named packet receive routine We need to perform the same actions when processing deferred name table updates, so this functionality is moved to a separate function. Signed-off-by: Erik Hugne Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/name_distr.c | 74 ++++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index dcc15bcd5692..0591f33b8384 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -262,53 +262,55 @@ static void named_purge_publ(struct publication *publ) kfree(p); } +/** + * tipc_update_nametbl - try to process a nametable update and notify + * subscribers + * + * tipc_nametbl_lock must be held. + * Returns the publication item if successful, otherwise NULL. + */ +struct publication *tipc_update_nametbl(struct distr_item *i, u32 node, + u32 dtype) +{ + struct publication *publ = NULL; + + if (dtype == PUBLICATION) { + publ = tipc_nametbl_insert_publ(ntohl(i->type), ntohl(i->lower), + ntohl(i->upper), + TIPC_CLUSTER_SCOPE, node, + ntohl(i->ref), ntohl(i->key)); + if (publ) { + tipc_nodesub_subscribe(&publ->subscr, node, publ, + (net_ev_handler) + named_purge_publ); + } + } else if (dtype == WITHDRAWAL) { + publ = tipc_nametbl_remove_publ(ntohl(i->type), ntohl(i->lower), + node, ntohl(i->ref), + ntohl(i->key)); + if (publ) { + tipc_nodesub_unsubscribe(&publ->subscr); + kfree(publ); + } + } else { + pr_warn("Unrecognized name table message received\n"); + } + return publ; +} + /** * tipc_named_rcv - process name table update message sent by another node */ void tipc_named_rcv(struct sk_buff *buf) { - struct publication *publ; struct tipc_msg *msg = buf_msg(buf); struct distr_item *item = (struct distr_item *)msg_data(msg); u32 count = msg_data_sz(msg) / ITEM_SIZE; write_lock_bh(&tipc_nametbl_lock); while (count--) { - if (msg_type(msg) == PUBLICATION) { - publ = tipc_nametbl_insert_publ(ntohl(item->type), - ntohl(item->lower), - ntohl(item->upper), - TIPC_CLUSTER_SCOPE, - msg_orignode(msg), - ntohl(item->ref), - ntohl(item->key)); - if (publ) { - tipc_nodesub_subscribe(&publ->subscr, - msg_orignode(msg), - publ, - (net_ev_handler) - named_purge_publ); - } - } else if (msg_type(msg) == WITHDRAWAL) { - publ = tipc_nametbl_remove_publ(ntohl(item->type), - ntohl(item->lower), - msg_orignode(msg), - ntohl(item->ref), - ntohl(item->key)); - - if (publ) { - tipc_nodesub_unsubscribe(&publ->subscr); - kfree(publ); - } else { - pr_err("Unable to remove publication by node 0x%x\n" - " (type=%u, lower=%u, ref=%u, key=%u)\n", - msg_orignode(msg), ntohl(item->type), - ntohl(item->lower), ntohl(item->ref), - ntohl(item->key)); - } - } else { - pr_warn("Unrecognized name table message received\n"); - } + tipc_update_nametbl(item, msg_orignode(msg), + msg_type(msg)); item++; } write_unlock_bh(&tipc_nametbl_lock); -- cgit v1.2.3 From a5325ae5b8bff051933a754db7727fc9823e6414 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 28 Aug 2014 09:08:47 +0200 Subject: tipc: add name distributor resiliency queue TIPC name table updates are distributed asynchronously in a cluster, entailing a risk of certain race conditions. E.g., if two nodes simultaneously issue conflicting (overlapping) publications, this may not be detected until both publications have reached a third node, in which case one of the publications will be silently dropped on that node. Hence, we end up with an inconsistent name table. In most cases this conflict is just a temporary race, e.g., one node is issuing a publication under the assumption that a previous, conflicting, publication has already been withdrawn by the other node. However, because of the (rtt related) distributed update delay, this may not yet hold true on all nodes. The symptom of this failure is a syslog message: "tipc: Cannot publish {%u,%u,%u}, overlap error". In this commit we add a resiliency queue at the receiving end of the name table distributor. When insertion of an arriving publication fails, we retain it in this queue for a short amount of time, assuming that another update will arrive very soon and clear the conflict. If so happens, we insert the publication, otherwise we drop it. The (configurable) retention value defaults to 2000 ms. Knowing from experience that the situation described above is extremely rare, there is no risk that the queue will accumulate any large number of items. Signed-off-by: Erik Hugne Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 16 ++++++++++ net/tipc/core.h | 1 + net/tipc/name_distr.c | 69 ++++++++++++++++++++++++++++++++++++++++++-- net/tipc/name_distr.h | 1 + net/tipc/name_table.c | 8 ++--- net/tipc/sysctl.c | 7 +++++ 6 files changed, 95 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 9a0319a82470..04892b821157 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -241,6 +241,9 @@ address of the router (or Connected) for internal networks. 6. TIPC ------------------------------------------------------- +tipc_rmem +---------- + The TIPC protocol now has a tunable for the receive memory, similar to the tcp_rmem - i.e. a vector of 3 INTEGERs: (min, default, max) @@ -252,3 +255,16 @@ The max value is set to CONN_OVERLOAD_LIMIT, and the default and min values are scaled (shifted) versions of that same value. Note that the min value is not at this point in time used in any meaningful way, but the triplet is preserved in order to be consistent with things like tcp_rmem. + +named_timeout +-------------- + +TIPC name table updates are distributed asynchronously in a cluster, without +any form of transaction handling. This means that different race scenarios are +possible. One such is that a name withdrawal sent out by one node and received +by another node may arrive after a second, overlapping name publication already +has been accepted from a third node, although the conflicting updates +originally may have been issued in the correct sequential order. +If named_timeout is nonzero, failed topology updates will be placed on a defer +queue until another event arrives that clears the error, or until the timeout +expires. Value is in milliseconds. diff --git a/net/tipc/core.h b/net/tipc/core.h index d2607a8e2b80..f773b148722f 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -81,6 +81,7 @@ extern u32 tipc_own_addr __read_mostly; extern int tipc_max_ports __read_mostly; extern int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; +extern int sysctl_tipc_named_timeout __read_mostly; /* * Other global variables diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 0591f33b8384..780ef710a849 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -1,7 +1,7 @@ /* * net/tipc/name_distr.c: TIPC name distribution code * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -71,6 +71,21 @@ static struct publ_list *publ_lists[] = { }; +int sysctl_tipc_named_timeout __read_mostly = 2000; + +/** + * struct tipc_dist_queue - queue holding deferred name table updates + */ +static struct list_head tipc_dist_queue = LIST_HEAD_INIT(tipc_dist_queue); + +struct distr_queue_item { + struct distr_item i; + u32 dtype; + u32 node; + unsigned long expires; + struct list_head next; +}; + /** * publ_to_item - add publication info to a publication message */ @@ -298,6 +313,52 @@ struct publication *tipc_update_nametbl(struct distr_item *i, u32 node, return publ; } +/** + * tipc_named_add_backlog - add a failed name table update to the backlog + * + */ +static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node) +{ + struct distr_queue_item *e; + unsigned long now = get_jiffies_64(); + + e = kzalloc(sizeof(*e), GFP_ATOMIC); + if (!e) + return; + e->dtype = type; + e->node = node; + e->expires = now + msecs_to_jiffies(sysctl_tipc_named_timeout); + memcpy(e, i, sizeof(*i)); + list_add_tail(&e->next, &tipc_dist_queue); +} + +/** + * tipc_named_process_backlog - try to process any pending name table updates + * from the network. + */ +void tipc_named_process_backlog(void) +{ + struct distr_queue_item *e, *tmp; + char addr[16]; + unsigned long now = get_jiffies_64(); + + list_for_each_entry_safe(e, tmp, &tipc_dist_queue, next) { + if (time_after(e->expires, now)) { + if (!tipc_update_nametbl(&e->i, e->node, e->dtype)) + continue; + } else { + tipc_addr_string_fill(addr, e->node); + pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %s key=%u\n", + e->dtype, ntohl(e->i.type), + ntohl(e->i.lower), + ntohl(e->i.upper), + addr, ntohl(e->i.key)); + } + list_del(&e->next); + kfree(e); + } +} + /** * tipc_named_rcv - process name table update message sent by another node */ @@ -306,13 +367,15 @@ void tipc_named_rcv(struct sk_buff *buf) struct tipc_msg *msg = buf_msg(buf); struct distr_item *item = (struct distr_item *)msg_data(msg); u32 count = msg_data_sz(msg) / ITEM_SIZE; + u32 node = msg_orignode(msg); write_lock_bh(&tipc_nametbl_lock); while (count--) { - tipc_update_nametbl(item, msg_orignode(msg), - msg_type(msg)); + if (!tipc_update_nametbl(item, node, msg_type(msg))) + tipc_named_add_backlog(item, msg_type(msg), node); item++; } + tipc_named_process_backlog(); write_unlock_bh(&tipc_nametbl_lock); kfree_skb(buf); } diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index 8afe32b7fc9a..b9e75feb3434 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -73,5 +73,6 @@ void named_cluster_distribute(struct sk_buff *buf); void tipc_named_node_up(u32 dnode); void tipc_named_rcv(struct sk_buff *buf); void tipc_named_reinit(void); +void tipc_named_process_backlog(void); #endif diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index c058e30f84aa..3a6a0a7c0759 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -261,8 +261,6 @@ static struct publication *tipc_nameseq_insert_publ(struct name_seq *nseq, /* Lower end overlaps existing entry => need an exact match */ if ((sseq->lower != lower) || (sseq->upper != upper)) { - pr_warn("Cannot publish {%u,%u,%u}, overlap error\n", - type, lower, upper); return NULL; } @@ -284,8 +282,6 @@ static struct publication *tipc_nameseq_insert_publ(struct name_seq *nseq, /* Fail if upper end overlaps into an existing entry */ if ((inspos < nseq->first_free) && (upper >= nseq->sseqs[inspos].lower)) { - pr_warn("Cannot publish {%u,%u,%u}, overlap error\n", - type, lower, upper); return NULL; } @@ -677,6 +673,8 @@ struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper, if (likely(publ)) { table.local_publ_count++; buf = tipc_named_publish(publ); + /* Any pending external events? */ + tipc_named_process_backlog(); } write_unlock_bh(&tipc_nametbl_lock); @@ -698,6 +696,8 @@ int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key) if (likely(publ)) { table.local_publ_count--; buf = tipc_named_withdraw(publ); + /* Any pending external events? */ + tipc_named_process_backlog(); write_unlock_bh(&tipc_nametbl_lock); list_del_init(&publ->pport_list); kfree(publ); diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index f3fef93325a8..1a779b1e8510 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -47,6 +47,13 @@ static struct ctl_table tipc_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "named_timeout", + .data = &sysctl_tipc_named_timeout, + .maxlen = sizeof(sysctl_tipc_named_timeout), + .mode = 0644, + .proc_handler = proc_dointvec, + }, {} }; -- cgit v1.2.3 From afb84b6261841f8ab387e267e748236fa805bea0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 28 Aug 2014 18:14:47 +0200 Subject: pktgen: add flag NO_TIMESTAMP to disable timestamping Then testing the TX limits of the stack, then it is useful to be-able to disable the do_gettimeofday() timetamping on every packet. This implements a pktgen flag NO_TIMESTAMP which will disable this call to do_gettimeofday(). The performance change on (my system E5-2695) with skb_clone=0, goes from TX 2,423,751 pps to 2,567,165 pps with flag NO_TIMESTAMP. Thus, the cost of do_gettimeofday() or saving is approx 23 nanosec. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/pktgen.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 5b36a9428c59..21cb4839bc97 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -202,6 +202,7 @@ #define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ #define F_NODE (1<<15) /* Node memory alloc*/ #define F_UDPCSUM (1<<16) /* Include UDP checksum */ +#define F_NO_TIMESTAMP (1<<17) /* Don't timestamp packets (default TS) */ /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -638,6 +639,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->flags & F_UDPCSUM) seq_puts(seq, "UDPCSUM "); + if (pkt_dev->flags & F_NO_TIMESTAMP) + seq_puts(seq, "NO_TIMESTAMP "); + if (pkt_dev->flags & F_MPLS_RND) seq_puts(seq, "MPLS_RND "); @@ -1243,6 +1247,9 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "!UDPCSUM") == 0) pkt_dev->flags &= ~F_UDPCSUM; + else if (strcmp(f, "NO_TIMESTAMP") == 0) + pkt_dev->flags |= F_NO_TIMESTAMP; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", @@ -1251,6 +1258,7 @@ static ssize_t pktgen_if_write(struct file *file, "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, " "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, " "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, " + "NO_TIMESTAMP, " #ifdef CONFIG_XFRM "IPSEC, " #endif @@ -2685,9 +2693,14 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, pgh->pgh_magic = htonl(PKTGEN_MAGIC); pgh->seq_num = htonl(pkt_dev->seq_num); - do_gettimeofday(×tamp); - pgh->tv_sec = htonl(timestamp.tv_sec); - pgh->tv_usec = htonl(timestamp.tv_usec); + if (pkt_dev->flags & F_NO_TIMESTAMP) { + pgh->tv_sec = 0; + pgh->tv_usec = 0; + } else { + do_gettimeofday(×tamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_usec); + } } static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, -- cgit v1.2.3 From 688d1945bc89bd585ec67b5b83121f499e6290bb Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 29 Aug 2014 23:32:05 -0700 Subject: tcp: whitespace fixes Fix places where there is space before tab, long lines, and awkward if(){, double spacing etc. Add blank line after declaration/initialization. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_bic.c | 11 ++-- net/ipv4/tcp_cong.c | 5 +- net/ipv4/tcp_cubic.c | 18 +++--- net/ipv4/tcp_diag.c | 5 +- net/ipv4/tcp_highspeed.c | 145 +++++++++++++++++++++++------------------------ net/ipv4/tcp_htcp.c | 6 +- net/ipv4/tcp_hybla.c | 1 - net/ipv4/tcp_illinois.c | 3 +- net/ipv4/tcp_ipv4.c | 5 +- net/ipv4/tcp_probe.c | 6 +- net/ipv4/tcp_scalable.c | 2 +- net/ipv4/tcp_vegas.c | 3 - net/ipv4/tcp_veno.c | 1 - net/ipv4/tcp_westwood.c | 7 +-- net/ipv4/tcp_yeah.c | 9 +-- 15 files changed, 104 insertions(+), 123 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index d5de69bc04f5..bb395d46a389 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -17,7 +17,6 @@ #include #include - #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta */ @@ -46,11 +45,10 @@ MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); module_param(smooth_part, int, 0644); MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); - /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ - u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ @@ -103,7 +101,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) /* binary increase */ if (cwnd < ca->last_max_cwnd) { - __u32 dist = (ca->last_max_cwnd - cwnd) + __u32 dist = (ca->last_max_cwnd - cwnd) / BICTCP_B; if (dist > max_increment) @@ -154,7 +152,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) bictcp_update(ca, tp->snd_cwnd); tcp_cong_avoid_ai(tp, ca->cnt); } - } /* @@ -177,7 +174,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) ca->loss_cwnd = tp->snd_cwnd; - if (tp->snd_cwnd <= low_window) return max(tp->snd_cwnd >> 1U, 2U); else @@ -188,6 +184,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct bictcp *ca = inet_csk_ca(sk); + return max(tp->snd_cwnd, ca->loss_cwnd); } @@ -206,12 +203,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) if (icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; } } - static struct tcp_congestion_ops bictcp __read_mostly = { .init = bictcp_init, .ssthresh = bictcp_recalc_ssthresh, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 7b09d8b49fa5..80248f56c89f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -142,7 +142,6 @@ static int __init tcp_congestion_default(void) } late_initcall(tcp_congestion_default); - /* Build string with list of available congestion control values */ void tcp_get_available_congestion_control(char *buf, size_t maxlen) { @@ -154,7 +153,6 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); - } rcu_read_unlock(); } @@ -186,7 +184,6 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); - } rcu_read_unlock(); } @@ -230,7 +227,6 @@ out: return ret; } - /* Change congestion control for socket */ int tcp_set_congestion_control(struct sock *sk, const char *name) { @@ -337,6 +333,7 @@ EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + return max(tp->snd_cwnd >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index a9bd8a4828a9..20de0118c98e 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -82,12 +82,13 @@ MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (mse /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ - u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ u32 bic_origin_point;/* origin point of bic function */ - u32 bic_K; /* time to origin point from the beginning of the current epoch */ + u32 bic_K; /* time to origin point + from the beginning of the current epoch */ u32 delay_min; /* min delay (msec << 3) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ @@ -219,7 +220,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->last_time = tcp_time_stamp; if (ca->epoch_start == 0) { - ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ + ca->epoch_start = tcp_time_stamp; /* record beginning */ ca->ack_cnt = 1; /* start counting */ ca->tcp_cwnd = cwnd; /* syn with cubic */ @@ -263,9 +264,9 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) /* c/rtt * (t-K)^3 */ delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); - if (t < ca->bic_K) /* below origin*/ + if (t < ca->bic_K) /* below origin*/ bic_target = ca->bic_origin_point - delta; - else /* above origin*/ + else /* above origin*/ bic_target = ca->bic_origin_point + delta; /* cubic function - calc bictcp_cnt*/ @@ -285,13 +286,14 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; + delta = (cwnd * scale) >> 3; while (ca->ack_cnt > delta) { /* update tcp cwnd */ ca->ack_cnt -= delta; ca->tcp_cwnd++; } - if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ + if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */ delta = ca->tcp_cwnd - cwnd; max_cnt = cwnd / delta; if (ca->cnt > max_cnt) @@ -320,7 +322,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) bictcp_update(ca, tp->snd_cwnd); tcp_cong_avoid_ai(tp, ca->cnt); } - } static u32 bictcp_recalc_ssthresh(struct sock *sk) @@ -452,7 +453,8 @@ static int __init cubictcp_register(void) * based on SRTT of 100ms */ - beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta); + beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3 + / (BICTCP_BETA_SCALE - beta); cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index ed3f2ad42e0f..0d73f9ddb55b 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -9,7 +9,6 @@ * 2 of the License, or (at your option) any later version. */ - #include #include @@ -35,13 +34,13 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) + struct inet_diag_req_v2 *r, struct nlattr *bc) { inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); } static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, - struct inet_diag_req_v2 *req) + struct inet_diag_req_v2 *req) { return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); } diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 1c4908280d92..882c08aae2f5 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -9,7 +9,6 @@ #include #include - /* From AIMD tables from RFC 3649 appendix B, * with fixed-point MD scaled <<8. */ @@ -17,78 +16,78 @@ static const struct hstcp_aimd_val { unsigned int cwnd; unsigned int md; } hstcp_aimd_vals[] = { - { 38, 128, /* 0.50 */ }, - { 118, 112, /* 0.44 */ }, - { 221, 104, /* 0.41 */ }, - { 347, 98, /* 0.38 */ }, - { 495, 93, /* 0.37 */ }, - { 663, 89, /* 0.35 */ }, - { 851, 86, /* 0.34 */ }, - { 1058, 83, /* 0.33 */ }, - { 1284, 81, /* 0.32 */ }, - { 1529, 78, /* 0.31 */ }, - { 1793, 76, /* 0.30 */ }, - { 2076, 74, /* 0.29 */ }, - { 2378, 72, /* 0.28 */ }, - { 2699, 71, /* 0.28 */ }, - { 3039, 69, /* 0.27 */ }, - { 3399, 68, /* 0.27 */ }, - { 3778, 66, /* 0.26 */ }, - { 4177, 65, /* 0.26 */ }, - { 4596, 64, /* 0.25 */ }, - { 5036, 62, /* 0.25 */ }, - { 5497, 61, /* 0.24 */ }, - { 5979, 60, /* 0.24 */ }, - { 6483, 59, /* 0.23 */ }, - { 7009, 58, /* 0.23 */ }, - { 7558, 57, /* 0.22 */ }, - { 8130, 56, /* 0.22 */ }, - { 8726, 55, /* 0.22 */ }, - { 9346, 54, /* 0.21 */ }, - { 9991, 53, /* 0.21 */ }, - { 10661, 52, /* 0.21 */ }, - { 11358, 52, /* 0.20 */ }, - { 12082, 51, /* 0.20 */ }, - { 12834, 50, /* 0.20 */ }, - { 13614, 49, /* 0.19 */ }, - { 14424, 48, /* 0.19 */ }, - { 15265, 48, /* 0.19 */ }, - { 16137, 47, /* 0.19 */ }, - { 17042, 46, /* 0.18 */ }, - { 17981, 45, /* 0.18 */ }, - { 18955, 45, /* 0.18 */ }, - { 19965, 44, /* 0.17 */ }, - { 21013, 43, /* 0.17 */ }, - { 22101, 43, /* 0.17 */ }, - { 23230, 42, /* 0.17 */ }, - { 24402, 41, /* 0.16 */ }, - { 25618, 41, /* 0.16 */ }, - { 26881, 40, /* 0.16 */ }, - { 28193, 39, /* 0.16 */ }, - { 29557, 39, /* 0.15 */ }, - { 30975, 38, /* 0.15 */ }, - { 32450, 38, /* 0.15 */ }, - { 33986, 37, /* 0.15 */ }, - { 35586, 36, /* 0.14 */ }, - { 37253, 36, /* 0.14 */ }, - { 38992, 35, /* 0.14 */ }, - { 40808, 35, /* 0.14 */ }, - { 42707, 34, /* 0.13 */ }, - { 44694, 33, /* 0.13 */ }, - { 46776, 33, /* 0.13 */ }, - { 48961, 32, /* 0.13 */ }, - { 51258, 32, /* 0.13 */ }, - { 53677, 31, /* 0.12 */ }, - { 56230, 30, /* 0.12 */ }, - { 58932, 30, /* 0.12 */ }, - { 61799, 29, /* 0.12 */ }, - { 64851, 28, /* 0.11 */ }, - { 68113, 28, /* 0.11 */ }, - { 71617, 27, /* 0.11 */ }, - { 75401, 26, /* 0.10 */ }, - { 79517, 26, /* 0.10 */ }, - { 84035, 25, /* 0.10 */ }, - { 89053, 24, /* 0.10 */ }, + { 38, 128, /* 0.50 */ }, + { 118, 112, /* 0.44 */ }, + { 221, 104, /* 0.41 */ }, + { 347, 98, /* 0.38 */ }, + { 495, 93, /* 0.37 */ }, + { 663, 89, /* 0.35 */ }, + { 851, 86, /* 0.34 */ }, + { 1058, 83, /* 0.33 */ }, + { 1284, 81, /* 0.32 */ }, + { 1529, 78, /* 0.31 */ }, + { 1793, 76, /* 0.30 */ }, + { 2076, 74, /* 0.29 */ }, + { 2378, 72, /* 0.28 */ }, + { 2699, 71, /* 0.28 */ }, + { 3039, 69, /* 0.27 */ }, + { 3399, 68, /* 0.27 */ }, + { 3778, 66, /* 0.26 */ }, + { 4177, 65, /* 0.26 */ }, + { 4596, 64, /* 0.25 */ }, + { 5036, 62, /* 0.25 */ }, + { 5497, 61, /* 0.24 */ }, + { 5979, 60, /* 0.24 */ }, + { 6483, 59, /* 0.23 */ }, + { 7009, 58, /* 0.23 */ }, + { 7558, 57, /* 0.22 */ }, + { 8130, 56, /* 0.22 */ }, + { 8726, 55, /* 0.22 */ }, + { 9346, 54, /* 0.21 */ }, + { 9991, 53, /* 0.21 */ }, + { 10661, 52, /* 0.21 */ }, + { 11358, 52, /* 0.20 */ }, + { 12082, 51, /* 0.20 */ }, + { 12834, 50, /* 0.20 */ }, + { 13614, 49, /* 0.19 */ }, + { 14424, 48, /* 0.19 */ }, + { 15265, 48, /* 0.19 */ }, + { 16137, 47, /* 0.19 */ }, + { 17042, 46, /* 0.18 */ }, + { 17981, 45, /* 0.18 */ }, + { 18955, 45, /* 0.18 */ }, + { 19965, 44, /* 0.17 */ }, + { 21013, 43, /* 0.17 */ }, + { 22101, 43, /* 0.17 */ }, + { 23230, 42, /* 0.17 */ }, + { 24402, 41, /* 0.16 */ }, + { 25618, 41, /* 0.16 */ }, + { 26881, 40, /* 0.16 */ }, + { 28193, 39, /* 0.16 */ }, + { 29557, 39, /* 0.15 */ }, + { 30975, 38, /* 0.15 */ }, + { 32450, 38, /* 0.15 */ }, + { 33986, 37, /* 0.15 */ }, + { 35586, 36, /* 0.14 */ }, + { 37253, 36, /* 0.14 */ }, + { 38992, 35, /* 0.14 */ }, + { 40808, 35, /* 0.14 */ }, + { 42707, 34, /* 0.13 */ }, + { 44694, 33, /* 0.13 */ }, + { 46776, 33, /* 0.13 */ }, + { 48961, 32, /* 0.13 */ }, + { 51258, 32, /* 0.13 */ }, + { 53677, 31, /* 0.12 */ }, + { 56230, 30, /* 0.12 */ }, + { 58932, 30, /* 0.12 */ }, + { 61799, 29, /* 0.12 */ }, + { 64851, 28, /* 0.11 */ }, + { 68113, 28, /* 0.11 */ }, + { 71617, 27, /* 0.11 */ }, + { 75401, 26, /* 0.10 */ }, + { 79517, 26, /* 0.10 */ }, + { 84035, 25, /* 0.10 */ }, + { 89053, 24, /* 0.10 */ }, }; #define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 031361311a8b..58469fff6c18 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -98,7 +98,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt) } } -static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt) +static void measure_achieved_throughput(struct sock *sk, + u32 pkts_acked, s32 rtt) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); @@ -148,8 +149,8 @@ static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) if (use_bandwidth_switch) { u32 maxB = ca->maxB; u32 old_maxB = ca->old_maxB; - ca->old_maxB = ca->maxB; + ca->old_maxB = ca->maxB; if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) { ca->beta = BETA_MIN; ca->modeswitch = 0; @@ -270,6 +271,7 @@ static void htcp_state(struct sock *sk, u8 new_state) case TCP_CA_Open: { struct htcp *ca = inet_csk_ca(sk); + if (ca->undo_last_cong) { ca->last_cong = jiffies; ca->undo_last_cong = 0; diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index d8f8f05a4951..f963b274f2b0 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -29,7 +29,6 @@ static int rtt0 = 25; module_param(rtt0, int, 0644); MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); - /* This is called to refresh values for hybla parameters */ static inline void hybla_recalc_param (struct sock *sk) { diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 5999b3972e64..1d5a30a90adf 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -284,7 +284,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT; if (delta >= tp->snd_cwnd) { tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd, - (u32) tp->snd_cwnd_clamp); + (u32)tp->snd_cwnd_clamp); tp->snd_cwnd_cnt = 0; } } @@ -299,7 +299,6 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); } - /* Extract info for Tcp socket info provided via netlink. */ static void tcp_illinois_info(struct sock *sk, u32 ext, struct sk_buff *skb) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cd17f009aede..487e2a41667f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -90,7 +90,6 @@ int sysctl_tcp_tw_reuse __read_mostly; int sysctl_tcp_low_latency __read_mostly; EXPORT_SYMBOL(sysctl_tcp_low_latency); - #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -1269,7 +1268,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, - .syn_ack_timeout = tcp_syn_ack_timeout, + .syn_ack_timeout = tcp_syn_ack_timeout, }; static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { @@ -2183,7 +2182,7 @@ int tcp_seq_open(struct inode *inode, struct file *file) s = ((struct seq_file *)file->private_data)->private; s->family = afinfo->family; - s->last_pos = 0; + s->last_pos = 0; return 0; } EXPORT_SYMBOL(tcp_seq_open); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 3b66610d4156..ebf5ff57526e 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -83,7 +83,6 @@ static struct { struct tcp_log *log; } tcp_probe; - static inline int tcp_probe_used(void) { return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); @@ -101,7 +100,6 @@ static inline int tcp_probe_avail(void) si4.sin_addr.s_addr = inet->inet_##mem##addr; \ } while (0) \ - /* * Hook inserted to be called before each receive packet. * Note: arguments must match tcp_rcv_established()! @@ -194,8 +192,8 @@ static int tcpprobe_sprint(char *tbuf, int n) return scnprintf(tbuf, n, "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", - (unsigned long) tv.tv_sec, - (unsigned long) tv.tv_nsec, + (unsigned long)tv.tv_sec, + (unsigned long)tv.tv_nsec, &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 8250949b8853..6824afb65d93 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -31,10 +31,10 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 tcp_scalable_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); } - static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, .cong_avoid = tcp_scalable_cong_avoid, diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index b40ad897f945..a6afde666ab1 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -51,7 +51,6 @@ MODULE_PARM_DESC(beta, "upper bound of packets in network"); module_param(gamma, int, 0644); MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); - /* There are several situations when we must "re-start" Vegas: * * o when a connection is established @@ -133,7 +132,6 @@ EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked); void tcp_vegas_state(struct sock *sk, u8 ca_state) { - if (ca_state == TCP_CA_Open) vegas_enable(sk); else @@ -285,7 +283,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* Use normal slow start */ else if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp, acked); - } /* Extract info for Tcp socket info provided via netlink. */ diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 8276977d2c85..a4d2d2d88dca 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -175,7 +175,6 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) } else tp->snd_cwnd_cnt++; } - } if (tp->snd_cwnd < 2) tp->snd_cwnd = 2; diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index b94a04ae2ed5..81911a92356c 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -42,7 +42,6 @@ struct westwood { u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ }; - /* TCP Westwood functions and constants */ #define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ #define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ @@ -153,7 +152,6 @@ static inline void update_rtt_min(struct westwood *w) w->rtt_min = min(w->rtt, w->rtt_min); } - /* * @westwood_fast_bw * It is called when we are in fast path. In particular it is called when @@ -208,7 +206,6 @@ static inline u32 westwood_acked_count(struct sock *sk) return w->cumul_ack; } - /* * TCP Westwood * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it @@ -219,6 +216,7 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct westwood *w = inet_csk_ca(sk); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } @@ -254,12 +252,12 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) } } - /* Extract info for Tcp socket info provided via netlink. */ static void tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct westwood *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { struct tcpvegas_info info = { .tcpv_enabled = 1, @@ -271,7 +269,6 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, } } - static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 599b79b8eac0..cd7273218598 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -54,10 +54,8 @@ static void tcp_yeah_init(struct sock *sk) /* Ensure the MD arithmetic works. This is somewhat pedantic, * since I don't think we will see a cwnd this large. :) */ tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); - } - static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -84,7 +82,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* Scalable */ tp->snd_cwnd_cnt += yeah->pkts_acked; - if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ + if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) { if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; @@ -120,7 +118,6 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) */ if (after(ack, yeah->vegas.beg_snd_nxt)) { - /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. @@ -189,7 +186,6 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) } yeah->lastQ = queue; - } /* Save the extent of the current window so we can use this @@ -205,7 +201,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) } } -static u32 tcp_yeah_ssthresh(struct sock *sk) { +static u32 tcp_yeah_ssthresh(struct sock *sk) +{ const struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); u32 reduction; -- cgit v1.2.3 From 61b7363ffa48b36e2ff086c2d2524e40d3766571 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 29 Aug 2014 12:42:07 -0700 Subject: net: dsa: make dsa_pack_type static net/dsa/dsa.c:624:20: sparse: symbol 'dsa_pack_type' was not declared. Should it be static? Fixes: 3e8a72d1dae374 ("net: dsa: reduce number of protocol hooks") Signed-off-by: Fengguang Wu Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 484f695351a7..61f145c44555 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -629,7 +629,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, return dst->ops->rcv(skb, dev, pt, orig_dev); } -struct packet_type dsa_pack_type __read_mostly = { +static struct packet_type dsa_pack_type __read_mostly = { .type = cpu_to_be16(ETH_P_XDSA), .func = dsa_switch_rcv, }; -- cgit v1.2.3 From 5a21232983aa7acfe7fd26170832a9e0a4a7b4ae Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 31 Aug 2014 15:12:41 -0700 Subject: net: Support for csum_bad in skbuff This flag indicates that an invalid checksum was detected in the packet. __skb_mark_checksum_bad helper function was added to set this. Checksums can be marked bad from a driver or the GRO path (the latter is implemented in this patch). csum_bad is checked in __skb_checksum_validate_complete (i.e. calling that when ip_summed == CHECKSUM_NONE). csum_bad works in conjunction with ip_summed value. In the case that ip_summed is CHECKSUM_NONE and csum_bad is set, this implies that the first (or next) checksum encountered in the packet is bad. When ip_summed is CHECKSUM_UNNECESSARY, the first checksum after the last one validated is bad. For example, if ip_summed == CHECKSUM_UNNECESSARY, csum_level == 1, and csum_bad is set-- then the third checksum in the packet is bad. In the normal path, the packet will be dropped when processing the protocol layer of the bad checksum: __skb_decr_checksum_unnecessary called twice for the good checksums changing ip_summed to CHECKSUM_NONE so that __skb_checksum_validate_complete is called to validate the third checksum and that will fail since csum_bad is set. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- include/linux/skbuff.h | 21 ++++++++++++++++++++- net/core/dev.c | 2 +- 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 202c25a9aadf..a0ab6d9d400a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2216,7 +2216,9 @@ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_gro_checksum_validate_complete(skb, \ compute_pseudo(skb, proto)); \ - if (!__ret) \ + if (__ret) \ + __skb_mark_checksum_bad(skb); \ + else \ skb_gro_incr_csum_unnecessary(skb); \ __ret; \ }) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c93b5859a772..23710a243439 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -617,7 +617,8 @@ struct sk_buff { kmemcheck_bitfield_begin(flags3); __u8 csum_level:2; - /* 14 bit hole */ + __u8 csum_bad:1; + /* 13 bit hole */ kmemcheck_bitfield_end(flags3); __be16 inner_protocol; @@ -2825,6 +2826,21 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } +static inline void __skb_mark_checksum_bad(struct sk_buff *skb) +{ + /* Mark current checksum as bad (typically called from GRO + * path). In the case that ip_summed is CHECKSUM_NONE + * this must be the first checksum encountered in the packet. + * When ip_summed is CHECKSUM_UNNECESSARY, this is the first + * checksum after the last one validated. For UDP, a zero + * checksum can not be marked as bad. + */ + + if (skb->ip_summed == CHECKSUM_NONE || + skb->ip_summed == CHECKSUM_UNNECESSARY) + skb->csum_bad = 1; +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise @@ -2866,6 +2882,9 @@ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, skb->csum_valid = 1; return 0; } + } else if (skb->csum_bad) { + /* ip_summed == CHECKSUM_NONE in this case */ + return 1; } skb->csum = psum; diff --git a/net/core/dev.c b/net/core/dev.c index 6857d57aa294..3774afc3bebf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3918,7 +3918,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_has_frag_list(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) goto normal; gro_list_prepare(napi, skb); -- cgit v1.2.3 From 2abb7cdc0dc84e99b76ef983a1ae1978922aa9b3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 31 Aug 2014 15:12:43 -0700 Subject: udp: Add support for doing checksum unnecessary conversion Add support for doing CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion in UDP tunneling path. In the normal UDP path, we call skb_checksum_try_convert after locating the UDP socket. The check is that checksum conversion is enabled for the socket (new flag in UDP socket) and that checksum field is non-zero. In the UDP GRO path, we call skb_gro_checksum_try_convert after checksum is validated and checksum field is non-zero. Since this is already in GRO we assume that checksum conversion is always wanted. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/udp.h | 16 +++++++++++++++- net/ipv4/udp.c | 4 ++++ net/ipv4/udp_offload.c | 25 +++++++++++++++++-------- net/ipv6/udp.c | 4 ++++ net/ipv6/udp_offload.c | 24 +++++++++++++++++------- 5 files changed, 57 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/linux/udp.h b/include/linux/udp.h index 247cfdcc4b08..ee3277593222 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -49,7 +49,11 @@ struct udp_sock { unsigned int corkflag; /* Cork is required */ __u8 encap_type; /* Is this an Encapsulation socket? */ unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ - no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */ + no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ + convert_csum:1;/* On receive, convert checksum + * unnecessary to checksum complete + * if possible. + */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -98,6 +102,16 @@ static inline bool udp_get_no_check6_rx(struct sock *sk) return udp_sk(sk)->no_check6_rx; } +static inline void udp_set_convert_csum(struct sock *sk, bool val) +{ + udp_sk(sk)->convert_csum = val; +} + +static inline bool udp_get_convert_csum(struct sock *sk) +{ + return udp_sk(sk)->convert_csum; +} + #define udp_portaddr_for_each_entry(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3549c21fe5f7..0da3849fd35b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1788,6 +1788,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; + if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_compute_pseudo); + ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a6adff98382a..84e0e05c9c0e 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -290,16 +290,25 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head, { struct udphdr *uh = udp_gro_udphdr(skb); - /* Don't bother verifying checksum if we're going to flush anyway. */ - if (unlikely(!uh) || - (!NAPI_GRO_CB(skb)->flush && - skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, - inet_gro_compute_pseudo))) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + if (unlikely(!uh)) + goto flush; + /* Don't bother verifying checksum if we're going to flush anyway. */ + if (!NAPI_GRO_CB(skb)->flush) + goto skip; + + if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + inet_gro_compute_pseudo)) + goto flush; + else if (uh->check) + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_gro_compute_pseudo); +skip: return udp_gro_receive(head, skb, uh); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } int udp_gro_complete(struct sk_buff *skb, int nhoff) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 12fcce8fba46..f6ba535b6feb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -891,6 +891,10 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto csum_error; } + if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + ip6_compute_pseudo); + ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b13e377e9c53..89cb9a9b8537 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -134,16 +134,26 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head, { struct udphdr *uh = udp_gro_udphdr(skb); + if (unlikely(!uh)) + goto flush; + /* Don't bother verifying checksum if we're going to flush anyway. */ - if (unlikely(!uh) || - (!NAPI_GRO_CB(skb)->flush && - skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, - ip6_gro_compute_pseudo))) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + if (!NAPI_GRO_CB(skb)->flush) + goto skip; + if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + ip6_gro_compute_pseudo)) + goto flush; + else if (uh->check) + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + ip6_gro_compute_pseudo); + +skip: return udp_gro_receive(head, skb, uh); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } int udp6_gro_complete(struct sk_buff *skb, int nhoff) -- cgit v1.2.3 From 884d338c041c2aa4536ade8620efa585e7c57f3c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 31 Aug 2014 15:12:44 -0700 Subject: gre: Add support for checksum unnecessary conversions Call skb_checksum_try_convert and skb_gro_checksum_try_convert after checksum is found present and validated in the GRE header for normal and GRO paths respectively. In GRO path, call skb_gro_checksum_try_convert Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/gre_demux.c | 4 ++++ net/ipv4/gre_offload.c | 8 ++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 0485bf7f8f03..7e0756da8737 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -125,6 +125,10 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, *csum_err = true; return -EINVAL; } + + skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); + options++; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index a4d7965fb880..d3fe2ac05167 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -172,10 +172,14 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, } /* Don't bother verifying checksum if we're going to flush anyway. */ - if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush && - skb_gro_checksum_simple_validate(skb)) + if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) { + if (skb_gro_checksum_simple_validate(skb)) goto out_unlock; + skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); + } + flush = 0; for (p = *head; p; p = p->next) { -- cgit v1.2.3 From 72297c59f7188d12f24daee1877e7dcca787cf1f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 31 Aug 2014 15:12:46 -0700 Subject: l2tp: Enable checksum unnecessary conversions for l2tp/UDP sockets Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 1109d3bb8dac..797c0af71c06 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1392,6 +1392,8 @@ static int l2tp_tunnel_sock_create(struct net *net, if (err < 0) goto out; + udp_set_convert_csum(sock->sk, true); + break; case L2TP_ENCAPTYPE_IP: -- cgit v1.2.3 From 364a9e93243d1785f310c0964af0e24bf1adac03 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 31 Aug 2014 21:30:27 -0400 Subject: sock: deduplicate errqueue dequeue sk->sk_error_queue is dequeued in four locations. All share the exact same logic. Deduplicate. Also collapse the two critical sections for dequeue (at the top of the recv handler) and signal (at the bottom). This moves signal generation for the next packet forward, which should be harmless. It also changes the behavior if the recv handler exits early with an error. Previously, a signal for follow-up packets on the errqueue would then not be scheduled. The new behavior, to always signal, is arguably a bug fix. For rxrpc, the change causes the same function to be called repeatedly for each queued packet (because the recv handler == sk_error_report). It is likely that all packets will fail for the same reason (e.g., memory exhaustion). This code runs without sk_lock held, so it is not safe to trust that sk->sk_err is immutable inbetween releasing q->lock and the subsequent test. Introduce int err just to avoid this potential race. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/skbuff.c | 20 ++++++++++++++++++++ net/core/sock.c | 14 ++------------ net/ipv4/ip_sockglue.c | 15 ++------------- net/ipv6/datagram.c | 15 ++------------- net/rxrpc/ar-error.c | 14 +------------- 6 files changed, 28 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 7f2ab72f321a..3fde6130789d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2041,6 +2041,7 @@ void sk_stop_timer(struct sock *sk, struct timer_list *timer); int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); +struct sk_buff *sock_dequeue_err_skb(struct sock *sk); /* * Recover an error report and clear atomically diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 163b673f9e62..53ce536e3d6e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3491,6 +3491,26 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); +struct sk_buff *sock_dequeue_err_skb(struct sock *sk) +{ + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *skb_next; + int err = 0; + + spin_lock_bh(&q->lock); + skb = __skb_dequeue(q); + if (skb && (skb_next = skb_peek(q))) + err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + spin_unlock_bh(&q->lock); + + sk->sk_err = err; + if (err) + sk->sk_error_report(sk); + + return skb; +} +EXPORT_SYMBOL(sock_dequeue_err_skb); + void __skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype) diff --git a/net/core/sock.c b/net/core/sock.c index f7f2352200ad..f1a638ee93d9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2488,11 +2488,11 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; + struct sk_buff *skb; int copied, err; err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; @@ -2513,16 +2513,6 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_ERRQUEUE; err = copied; - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { - sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else - spin_unlock_bh(&sk->sk_error_queue.lock); - out_free_skb: kfree_skb(skb); out: diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5cb830c78990..455e75bcb167 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -405,7 +405,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; + struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct { struct sock_extended_err ee; @@ -415,7 +415,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int copied; err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; @@ -462,17 +462,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_ERRQUEUE; err = copied; - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - skb2 = skb_peek(&sk->sk_error_queue); - if (skb2 != NULL) { - sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else - spin_unlock_bh(&sk->sk_error_queue.lock); - out_free_skb: kfree_skb(skb); out: diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 1844e874a350..2cdc38338be3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -332,7 +332,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; + struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); struct { struct sock_extended_err ee; @@ -342,7 +342,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int copied; err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; @@ -415,17 +415,6 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_ERRQUEUE; err = copied; - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { - sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else { - spin_unlock_bh(&sk->sk_error_queue.lock); - } - out_free_skb: kfree_skb(skb); out: diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index db57458c824c..74c0fcd36838 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -37,7 +37,7 @@ void rxrpc_UDP_error_report(struct sock *sk) _enter("%p{%d}", sk, local->debug_id); - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (!skb) { _leave("UDP socket errqueue empty"); return; @@ -111,18 +111,6 @@ void rxrpc_UDP_error_report(struct sock *sk) skb_queue_tail(&trans->error_queue, skb); rxrpc_queue_work(&trans->error_handler); - /* reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - skb = skb_peek(&sk->sk_error_queue); - if (skb) { - sk->sk_err = SKB_EXT_ERR(skb)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else { - spin_unlock_bh(&sk->sk_error_queue.lock); - } - _leave(""); } -- cgit v1.2.3 From b58555f1767c9f4e330fcf168e4e753d2d9196e0 Mon Sep 17 00:00:00 2001 From: Christophe Gouault Date: Fri, 29 Aug 2014 16:16:04 +0200 Subject: xfrm: hash prefixed policies based on preflen thresholds The idea is an extension of the current policy hashing. Today only non-prefixed policies are stored in a hash table. This patch relaxes the constraints, and hashes policies whose prefix lengths are greater or equal to a configurable threshold. Each hash table (one per direction) maintains its own set of IPv4 and IPv6 thresholds (dbits4, sbits4, dbits6, sbits6), by default (32, 32, 128, 128). Example, if the output hash table is configured with values (16, 24, 56, 64): ip xfrm policy add dir out src 10.22.0.0/20 dst 10.24.1.0/24 ... => hashed ip xfrm policy add dir out src 10.22.0.0/16 dst 10.24.1.1/32 ... => hashed ip xfrm policy add dir out src 10.22.0.0/16 dst 10.24.0.0/16 ... => unhashed ip xfrm policy add dir out \ src 3ffe:304:124:2200::/60 dst 3ffe:304:124:2401::/64 ... => hashed ip xfrm policy add dir out \ src 3ffe:304:124:2200::/56 dst 3ffe:304:124:2401::2/128 ... => hashed ip xfrm policy add dir out \ src 3ffe:304:124:2200::/56 dst 3ffe:304:124:2400::/56 ... => unhashed The high order bits of the addresses (up to the threshold) are used to compute the hash key. Signed-off-by: Christophe Gouault Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 4 +++ net/xfrm/xfrm_hash.h | 76 +++++++++++++++++++++++++++++++++++++++++------- net/xfrm/xfrm_policy.c | 53 +++++++++++++++++++++++++++++---- 3 files changed, 117 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 3492434baf88..41902a8103bd 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -13,6 +13,10 @@ struct ctl_table_header; struct xfrm_policy_hash { struct hlist_head *table; unsigned int hmask; + u8 dbits4; + u8 sbits4; + u8 dbits6; + u8 sbits6; }; struct netns_xfrm { diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h index 0622d319e1f2..666c5ffe929d 100644 --- a/net/xfrm/xfrm_hash.h +++ b/net/xfrm/xfrm_hash.h @@ -3,6 +3,7 @@ #include #include +#include static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr) { @@ -28,6 +29,58 @@ static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr, saddr->a6[2] ^ saddr->a6[3]); } +static inline u32 __bits2mask32(__u8 bits) +{ + u32 mask32 = 0xffffffff; + + if (bits == 0) + mask32 = 0; + else if (bits < 32) + mask32 <<= (32 - bits); + + return mask32; +} + +static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + __u8 dbits, + __u8 sbits) +{ + return jhash_2words(ntohl(daddr->a4) & __bits2mask32(dbits), + ntohl(saddr->a4) & __bits2mask32(sbits), + 0); +} + +static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr, + __u8 prefixlen) +{ + int pdw; + int pbi; + u32 initval = 0; + + pdw = prefixlen >> 5; /* num of whole u32 in prefix */ + pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ + + if (pbi) { + __be32 mask; + + mask = htonl((0xffffffff) << (32 - pbi)); + + initval = (__force u32)(addr->a6[pdw] & mask); + } + + return jhash2((__force u32 *)addr->a6, pdw, initval); +} + +static inline unsigned int __xfrm6_dpref_spref_hash(const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + __u8 dbits, + __u8 sbits) +{ + return __xfrm6_pref_hash(daddr, dbits) ^ + __xfrm6_pref_hash(saddr, sbits); +} + static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, u32 reqid, unsigned short family, @@ -84,7 +137,8 @@ static inline unsigned int __idx_hash(u32 index, unsigned int hmask) } static inline unsigned int __sel_hash(const struct xfrm_selector *sel, - unsigned short family, unsigned int hmask) + unsigned short family, unsigned int hmask, + u8 dbits, u8 sbits) { const xfrm_address_t *daddr = &sel->daddr; const xfrm_address_t *saddr = &sel->saddr; @@ -92,19 +146,19 @@ static inline unsigned int __sel_hash(const struct xfrm_selector *sel, switch (family) { case AF_INET: - if (sel->prefixlen_d != 32 || - sel->prefixlen_s != 32) + if (sel->prefixlen_d < dbits || + sel->prefixlen_s < sbits) return hmask + 1; - h = __xfrm4_daddr_saddr_hash(daddr, saddr); + h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: - if (sel->prefixlen_d != 128 || - sel->prefixlen_s != 128) + if (sel->prefixlen_d < dbits || + sel->prefixlen_s < sbits) return hmask + 1; - h = __xfrm6_daddr_saddr_hash(daddr, saddr); + h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); @@ -113,17 +167,19 @@ static inline unsigned int __sel_hash(const struct xfrm_selector *sel, static inline unsigned int __addr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, - unsigned short family, unsigned int hmask) + unsigned short family, + unsigned int hmask, + u8 dbits, u8 sbits) { unsigned int h = 0; switch (family) { case AF_INET: - h = __xfrm4_daddr_saddr_hash(daddr, saddr); + h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: - h = __xfrm6_daddr_saddr_hash(daddr, saddr); + h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index beeed602aeb3..e6ff7b4046ea 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -344,12 +344,39 @@ static inline unsigned int idx_hash(struct net *net, u32 index) return __idx_hash(index, net->xfrm.policy_idx_hmask); } +/* calculate policy hash thresholds */ +static void __get_hash_thresh(struct net *net, + unsigned short family, int dir, + u8 *dbits, u8 *sbits) +{ + switch (family) { + case AF_INET: + *dbits = net->xfrm.policy_bydst[dir].dbits4; + *sbits = net->xfrm.policy_bydst[dir].sbits4; + break; + + case AF_INET6: + *dbits = net->xfrm.policy_bydst[dir].dbits6; + *sbits = net->xfrm.policy_bydst[dir].sbits6; + break; + + default: + *dbits = 0; + *sbits = 0; + } +} + static struct hlist_head *policy_hash_bysel(struct net *net, const struct xfrm_selector *sel, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; - unsigned int hash = __sel_hash(sel, family, hmask); + unsigned int hash; + u8 dbits; + u8 sbits; + + __get_hash_thresh(net, family, dir, &dbits, &sbits); + hash = __sel_hash(sel, family, hmask, dbits, sbits); return (hash == hmask + 1 ? &net->xfrm.policy_inexact[dir] : @@ -362,25 +389,35 @@ static struct hlist_head *policy_hash_direct(struct net *net, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; - unsigned int hash = __addr_hash(daddr, saddr, family, hmask); + unsigned int hash; + u8 dbits; + u8 sbits; + + __get_hash_thresh(net, family, dir, &dbits, &sbits); + hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); return net->xfrm.policy_bydst[dir].table + hash; } -static void xfrm_dst_hash_transfer(struct hlist_head *list, +static void xfrm_dst_hash_transfer(struct net *net, + struct hlist_head *list, struct hlist_head *ndsttable, - unsigned int nhashmask) + unsigned int nhashmask, + int dir) { struct hlist_node *tmp, *entry0 = NULL; struct xfrm_policy *pol; unsigned int h0 = 0; + u8 dbits; + u8 sbits; redo: hlist_for_each_entry_safe(pol, tmp, list, bydst) { unsigned int h; + __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, - pol->family, nhashmask); + pol->family, nhashmask, dbits, sbits); if (!entry0) { hlist_del(&pol->bydst); hlist_add_head(&pol->bydst, ndsttable+h); @@ -434,7 +471,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) write_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) - xfrm_dst_hash_transfer(odst + i, ndst, nhashmask); + xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); net->xfrm.policy_bydst[dir].table = ndst; net->xfrm.policy_bydst[dir].hmask = nhashmask; @@ -2830,6 +2867,10 @@ static int __net_init xfrm_policy_init(struct net *net) if (!htab->table) goto out_bydst; htab->hmask = hmask; + htab->dbits4 = 32; + htab->sbits4 = 32; + htab->dbits6 = 128; + htab->sbits6 = 128; } INIT_LIST_HEAD(&net->xfrm.policy_all); -- cgit v1.2.3 From 880a6fab8f6ba5b5abe59ea68533202ddea1012c Mon Sep 17 00:00:00 2001 From: Christophe Gouault Date: Fri, 29 Aug 2014 16:16:05 +0200 Subject: xfrm: configure policy hash table thresholds by netlink Enable to specify local and remote prefix length thresholds for the policy hash table via a netlink XFRM_MSG_NEWSPDINFO message. prefix length thresholds are specified by XFRMA_SPD_IPV4_HTHRESH and XFRMA_SPD_IPV6_HTHRESH optional attributes (struct xfrmu_spdhthresh). example: struct xfrmu_spdhthresh thresh4 = { .lbits = 0; .rbits = 24; }; struct xfrmu_spdhthresh thresh6 = { .lbits = 0; .rbits = 56; }; struct nlmsghdr *hdr; struct nl_msg *msg; msg = nlmsg_alloc(); hdr = nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, XFRMA_SPD_IPV4_HTHRESH, sizeof(__u32), NLM_F_REQUEST); nla_put(msg, XFRMA_SPD_IPV4_HTHRESH, sizeof(thresh4), &thresh4); nla_put(msg, XFRMA_SPD_IPV6_HTHRESH, sizeof(thresh6), &thresh6); nla_send_auto(sk, msg); The numbers are the policy selector minimum prefix lengths to put a policy in the hash table. - lbits is the local threshold (source address for out policies, destination address for in and fwd policies). - rbits is the remote threshold (destination address for out policies, source address for in and fwd policies). The default values are: XFRMA_SPD_IPV4_HTHRESH: 32 32 XFRMA_SPD_IPV6_HTHRESH: 128 128 Dynamic re-building of the SPD is performed when the thresholds values are changed. The current thresholds can be read via a XFRM_MSG_GETSPDINFO request: the kernel replies to XFRM_MSG_GETSPDINFO requests by an XFRM_MSG_NEWSPDINFO message, with both attributes XFRMA_SPD_IPV4_HTHRESH and XFRMA_SPD_IPV6_HTHRESH. Signed-off-by: Christophe Gouault Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 10 ++++++ include/net/xfrm.h | 1 + include/uapi/linux/xfrm.h | 7 ++++ net/xfrm/xfrm_policy.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_user.c | 80 +++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 182 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 41902a8103bd..9da798256f0e 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -19,6 +19,15 @@ struct xfrm_policy_hash { u8 sbits6; }; +struct xfrm_policy_hthresh { + struct work_struct work; + seqlock_t lock; + u8 lbits4; + u8 rbits4; + u8 lbits6; + u8 rbits6; +}; + struct netns_xfrm { struct list_head state_all; /* @@ -45,6 +54,7 @@ struct netns_xfrm { struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX * 2]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; struct work_struct policy_hash_work; + struct xfrm_policy_hthresh policy_hthresh; struct sock *nlsk; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 721e9c3b11bd..dc4865e90fe4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1591,6 +1591,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir, u32 id, int delete, int *err); int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); +void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); int verify_spi_info(u8 proto, u32 min, u32 max); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 25e5dd916ba4..02d5125a5ee8 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -328,6 +328,8 @@ enum xfrm_spdattr_type_t { XFRMA_SPD_UNSPEC, XFRMA_SPD_INFO, XFRMA_SPD_HINFO, + XFRMA_SPD_IPV4_HTHRESH, + XFRMA_SPD_IPV6_HTHRESH, __XFRMA_SPD_MAX #define XFRMA_SPD_MAX (__XFRMA_SPD_MAX - 1) @@ -347,6 +349,11 @@ struct xfrmu_spdhinfo { __u32 spdhmcnt; }; +struct xfrmu_spdhthresh { + __u8 lbits; + __u8 rbits; +}; + struct xfrm_usersa_info { struct xfrm_selector sel; struct xfrm_id id; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e6ff7b4046ea..55bcb8604bc6 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -566,6 +566,86 @@ static void xfrm_hash_resize(struct work_struct *work) mutex_unlock(&hash_resize_mutex); } +static void xfrm_hash_rebuild(struct work_struct *work) +{ + struct net *net = container_of(work, struct net, + xfrm.policy_hthresh.work); + unsigned int hmask; + struct xfrm_policy *pol; + struct xfrm_policy *policy; + struct hlist_head *chain; + struct hlist_head *odst; + struct hlist_node *newpos; + int i; + int dir; + unsigned seq; + u8 lbits4, rbits4, lbits6, rbits6; + + mutex_lock(&hash_resize_mutex); + + /* read selector prefixlen thresholds */ + do { + seq = read_seqbegin(&net->xfrm.policy_hthresh.lock); + + lbits4 = net->xfrm.policy_hthresh.lbits4; + rbits4 = net->xfrm.policy_hthresh.rbits4; + lbits6 = net->xfrm.policy_hthresh.lbits6; + rbits6 = net->xfrm.policy_hthresh.rbits6; + } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); + + write_lock_bh(&net->xfrm.xfrm_policy_lock); + + /* reset the bydst and inexact table in all directions */ + for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { + INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); + hmask = net->xfrm.policy_bydst[dir].hmask; + odst = net->xfrm.policy_bydst[dir].table; + for (i = hmask; i >= 0; i--) + INIT_HLIST_HEAD(odst + i); + if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { + /* dir out => dst = remote, src = local */ + net->xfrm.policy_bydst[dir].dbits4 = rbits4; + net->xfrm.policy_bydst[dir].sbits4 = lbits4; + net->xfrm.policy_bydst[dir].dbits6 = rbits6; + net->xfrm.policy_bydst[dir].sbits6 = lbits6; + } else { + /* dir in/fwd => dst = local, src = remote */ + net->xfrm.policy_bydst[dir].dbits4 = lbits4; + net->xfrm.policy_bydst[dir].sbits4 = rbits4; + net->xfrm.policy_bydst[dir].dbits6 = lbits6; + net->xfrm.policy_bydst[dir].sbits6 = rbits6; + } + } + + /* re-insert all policies by order of creation */ + list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + newpos = NULL; + chain = policy_hash_bysel(net, &policy->selector, + policy->family, + xfrm_policy_id2dir(policy->index)); + hlist_for_each_entry(pol, chain, bydst) { + if (policy->priority >= pol->priority) + newpos = &pol->bydst; + else + break; + } + if (newpos) + hlist_add_behind(&policy->bydst, newpos); + else + hlist_add_head(&policy->bydst, chain); + } + + write_unlock_bh(&net->xfrm.xfrm_policy_lock); + + mutex_unlock(&hash_resize_mutex); +} + +void xfrm_policy_hash_rebuild(struct net *net) +{ + schedule_work(&net->xfrm.policy_hthresh.work); +} +EXPORT_SYMBOL(xfrm_policy_hash_rebuild); + /* Generate new index... KAME seems to generate them ordered by cost * of an absolute inpredictability of ordering of rules. This will not pass. */ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) @@ -2872,9 +2952,16 @@ static int __net_init xfrm_policy_init(struct net *net) htab->dbits6 = 128; htab->sbits6 = 128; } + net->xfrm.policy_hthresh.lbits4 = 32; + net->xfrm.policy_hthresh.rbits4 = 32; + net->xfrm.policy_hthresh.lbits6 = 128; + net->xfrm.policy_hthresh.rbits6 = 128; + + seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); + INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); if (net_eq(net, &init_net)) register_netdevice_notifier(&xfrm_dev_notifier); return 0; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d4db6ebb089d..eaf8a8f1cbe8 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -964,7 +964,9 @@ static inline size_t xfrm_spdinfo_msgsize(void) { return NLMSG_ALIGN(4) + nla_total_size(sizeof(struct xfrmu_spdinfo)) - + nla_total_size(sizeof(struct xfrmu_spdhinfo)); + + nla_total_size(sizeof(struct xfrmu_spdhinfo)) + + nla_total_size(sizeof(struct xfrmu_spdhthresh)) + + nla_total_size(sizeof(struct xfrmu_spdhthresh)); } static int build_spdinfo(struct sk_buff *skb, struct net *net, @@ -973,9 +975,11 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, struct xfrmk_spdinfo si; struct xfrmu_spdinfo spc; struct xfrmu_spdhinfo sph; + struct xfrmu_spdhthresh spt4, spt6; struct nlmsghdr *nlh; int err; u32 *f; + unsigned lseq; nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0); if (nlh == NULL) /* shouldn't really happen ... */ @@ -993,9 +997,22 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, sph.spdhcnt = si.spdhcnt; sph.spdhmcnt = si.spdhmcnt; + do { + lseq = read_seqbegin(&net->xfrm.policy_hthresh.lock); + + spt4.lbits = net->xfrm.policy_hthresh.lbits4; + spt4.rbits = net->xfrm.policy_hthresh.rbits4; + spt6.lbits = net->xfrm.policy_hthresh.lbits6; + spt6.rbits = net->xfrm.policy_hthresh.rbits6; + } while (read_seqretry(&net->xfrm.policy_hthresh.lock, lseq)); + err = nla_put(skb, XFRMA_SPD_INFO, sizeof(spc), &spc); if (!err) err = nla_put(skb, XFRMA_SPD_HINFO, sizeof(sph), &sph); + if (!err) + err = nla_put(skb, XFRMA_SPD_IPV4_HTHRESH, sizeof(spt4), &spt4); + if (!err) + err = nla_put(skb, XFRMA_SPD_IPV6_HTHRESH, sizeof(spt6), &spt6); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -1004,6 +1021,51 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, return nlmsg_end(skb, nlh); } +static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrmu_spdhthresh *thresh4 = NULL; + struct xfrmu_spdhthresh *thresh6 = NULL; + + /* selector prefixlen thresholds to hash policies */ + if (attrs[XFRMA_SPD_IPV4_HTHRESH]) { + struct nlattr *rta = attrs[XFRMA_SPD_IPV4_HTHRESH]; + + if (nla_len(rta) < sizeof(*thresh4)) + return -EINVAL; + thresh4 = nla_data(rta); + if (thresh4->lbits > 32 || thresh4->rbits > 32) + return -EINVAL; + } + if (attrs[XFRMA_SPD_IPV6_HTHRESH]) { + struct nlattr *rta = attrs[XFRMA_SPD_IPV6_HTHRESH]; + + if (nla_len(rta) < sizeof(*thresh6)) + return -EINVAL; + thresh6 = nla_data(rta); + if (thresh6->lbits > 128 || thresh6->rbits > 128) + return -EINVAL; + } + + if (thresh4 || thresh6) { + write_seqlock(&net->xfrm.policy_hthresh.lock); + if (thresh4) { + net->xfrm.policy_hthresh.lbits4 = thresh4->lbits; + net->xfrm.policy_hthresh.rbits4 = thresh4->rbits; + } + if (thresh6) { + net->xfrm.policy_hthresh.lbits6 = thresh6->lbits; + net->xfrm.policy_hthresh.rbits6 = thresh6->rbits; + } + write_sequnlock(&net->xfrm.policy_hthresh.lock); + + xfrm_policy_hash_rebuild(net); + } + + return 0; +} + static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { @@ -2274,6 +2336,7 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report), [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), }; @@ -2308,10 +2371,17 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, }; +static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { + [XFRMA_SPD_IPV4_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) }, + [XFRMA_SPD_IPV6_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) }, +}; + static const struct xfrm_link { int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **); int (*dump)(struct sk_buff *, struct netlink_callback *); int (*done)(struct netlink_callback *); + const struct nla_policy *nla_pol; + int nla_max; } xfrm_dispatch[XFRM_NR_MSGTYPES] = { [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa }, [XFRM_MSG_DELSA - XFRM_MSG_BASE] = { .doit = xfrm_del_sa }, @@ -2335,6 +2405,9 @@ static const struct xfrm_link { [XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae }, [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate }, [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo }, + [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_set_spdinfo, + .nla_pol = xfrma_spd_policy, + .nla_max = XFRMA_SPD_MAX }, [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, }; @@ -2371,8 +2444,9 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } } - err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, XFRMA_MAX, - xfrma_policy); + err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, + link->nla_max ? : XFRMA_MAX, + link->nla_pol ? : xfrma_policy); if (err < 0) return err; -- cgit v1.2.3 From 30766f4c2d60dd2a3fc67b7114174c417f43f4c6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 5 Aug 2014 20:02:42 +0200 Subject: netfilter: nat: move specific NAT IPv4 to core Move the specific NAT IPv4 core functions that are called from the hooks from iptable_nat.c to nf_nat_l3proto_ipv4.c. This prepares the ground to allow iptables and nft to use the same NAT engine code that comes in a follow up patch. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_l3proto.h | 38 +++++ net/ipv4/netfilter/iptable_nat.c | 233 +++++-------------------------- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 199 ++++++++++++++++++++++++++ 3 files changed, 271 insertions(+), 199 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h index 5a2919b2e09a..bc2d51574489 100644 --- a/include/net/netfilter/nf_nat_l3proto.h +++ b/include/net/netfilter/nf_nat_l3proto.h @@ -42,6 +42,44 @@ const struct nf_nat_l3proto *__nf_nat_l3proto_find(u8 l3proto); int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum); + +unsigned int nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index f1787c04a4dd..6b67d7e9a75d 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -28,222 +28,57 @@ static const struct xt_table nf_nat_ipv4_table = { .af = NFPROTO_IPV4, }; -static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) -{ - /* Force range to this IP; let proto decide mapping for - * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). - */ - struct nf_nat_range range; - - range.flags = 0; - pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, - HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); - - return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); -} - -static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, - const struct net_device *in, - const struct net_device *out, - struct nf_conn *ct) +static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - unsigned int ret; - ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); - if (ret == NF_ACCEPT) { - if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) - ret = alloc_null_binding(ct, hooknum); - } - return ret; + return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.nat_table); } -static unsigned int -nf_nat_ipv4_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - struct nf_conn_nat *nat; - /* maniptype == SRC for postrouting. */ - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); - - /* We never see fragments: conntrack defrags on pre-routing - * and local-out, and nf_nat_out protects post-routing. - */ - NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); - - ct = nf_ct_get(skb, &ctinfo); - /* Can't track? It's not due to stress, or conntrack would - * have dropped it. Hence it's the user's responsibilty to - * packet filter it out, or implement conntrack/NAT for that - * protocol. 8) --RR - */ - if (!ct) - return NF_ACCEPT; - - /* Don't try to NAT if this packet is not conntracked */ - if (nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - ops->hooknum)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - unsigned int ret; - - ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); - if (ret != NF_ACCEPT) - return ret; - } else { - pr_debug("Already setup manip %s for ct %p\n", - maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", - ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - break; - - default: - /* ESTABLISHED */ - NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); - -oif_changed: - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_DROP; + return nf_nat_ipv4_fn(ops, skb, in, out, iptable_nat_do_chain); } -static unsigned int -nf_nat_ipv4_in(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - unsigned int ret; - __be32 daddr = ip_hdr(skb)->daddr; - - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - daddr != ip_hdr(skb)->daddr) - skb_dst_drop(skb); - - return ret; + return nf_nat_ipv4_in(ops, skb, in, out, iptable_nat_do_chain); } -static unsigned int -nf_nat_ipv4_out(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { -#ifdef CONFIG_XFRM - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - int err; -#endif - unsigned int ret; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if ((ct->tuplehash[dir].tuple.src.u3.ip != - ct->tuplehash[!dir].tuple.dst.u3.ip) || - (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && - ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET); - if (err < 0) - ret = NF_DROP_ERR(err); - } - } -#endif - return ret; + return nf_nat_ipv4_out(ops, skb, in, out, iptable_nat_do_chain); } -static unsigned int -nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned int ret; - int err; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.dst.u3.ip != - ct->tuplehash[!dir].tuple.src.u3.ip) { - err = ip_route_me_harder(skb, RTN_UNSPEC); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#ifdef CONFIG_XFRM - else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && - ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#endif - } - return ret; + return nf_nat_ipv4_local_fn(ops, skb, in, out, iptable_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv4_in, + .hook = iptable_nat_ipv4_in, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, @@ -251,7 +86,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv4_out, + .hook = iptable_nat_ipv4_out, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, @@ -259,7 +94,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv4_local_fn, + .hook = iptable_nat_ipv4_local_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, @@ -267,7 +102,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv4_fn, + .hook = iptable_nat_ipv4_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 14f5ccd06337..fc37711e11f3 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -254,6 +254,205 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); +unsigned int +nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + /* maniptype == SRC for postrouting. */ + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + + /* We never see fragments: conntrack defrags on pre-routing + * and local-out, and nf_nat_out protects post-routing. + */ + NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); + + ct = nf_ct_get(skb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + * have dropped it. Hence it's the user's responsibilty to + * packet filter it out, or implement conntrack/NAT for that + * protocol. 8) --RR + */ + if (!ct) + return NF_ACCEPT; + + /* Don't try to NAT if this packet is not conntracked */ + if (nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + ops->hooknum)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + unsigned int ret; + + ret = do_chain(ops, skb, in, out, ct); + if (ret != NF_ACCEPT) + return ret; + + if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + break; + + ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + if (ret != NF_ACCEPT) + return ret; + } else { + pr_debug("Already setup manip %s for ct %p\n", + maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", + ct); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + break; + + default: + /* ESTABLISHED */ + NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || + ctinfo == IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + +oif_changed: + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); + +unsigned int +nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + unsigned int ret; + __be32 daddr = ip_hdr(skb)->daddr; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + daddr != ip_hdr(skb)->daddr) + skb_dst_drop(skb); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_in); + +unsigned int +nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ +#ifdef CONFIG_XFRM + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int err; +#endif + unsigned int ret; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if ((ct->tuplehash[dir].tuple.src.u3.ip != + ct->tuplehash[!dir].tuple.dst.u3.ip) || + (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && + ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all)) { + err = nf_xfrm_me_harder(skb, AF_INET); + if (err < 0) + ret = NF_DROP_ERR(err); + } + } +#endif + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_out); + +unsigned int +nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + int err; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.dst.u3.ip != + ct->tuplehash[!dir].tuple.src.u3.ip) { + err = ip_route_me_harder(skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#ifdef CONFIG_XFRM + else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && + ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) { + err = nf_xfrm_me_harder(skb, AF_INET); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#endif + } + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn); + static int __init nf_nat_l3proto_ipv4_init(void) { int err; -- cgit v1.2.3 From 65cd90ac765fb6960f1e3815cc31972fc4599c37 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 5 Aug 2014 20:14:30 +0200 Subject: netfilter: nft_chain_nat_ipv4: use generic IPv4 NAT code from core Use the exported IPv4 NAT functions that are provided by the core. This removes duplicated code so iptables and nft use the same NAT codebase. Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_chain_nat_ipv4.c | 157 ++++++++------------------------ 1 file changed, 37 insertions(+), 120 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index 3964157d826c..df547bf50078 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -26,136 +26,53 @@ #include #include -/* - * NAT chains - */ - -static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) { - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - struct nf_conn_nat *nat; - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); struct nft_pktinfo pkt; - unsigned int ret; - - if (ct == NULL || nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED + IP_CT_IS_REPLY: - if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - ops->hooknum)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall through */ - case IP_CT_NEW: - if (nf_nat_initialized(ct, maniptype)) - break; - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - ret = nft_do_chain(&pkt, ops); - if (ret != NF_ACCEPT) - return ret; - if (!nf_nat_initialized(ct, maniptype)) { - ret = nf_nat_alloc_null_binding(ct, ops->hooknum); - if (ret != NF_ACCEPT) - return ret; - } - default: - break; - } - - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + return nft_do_chain(&pkt, ops); } -static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - __be32 daddr = ip_hdr(skb)->daddr; - unsigned int ret; - - ret = nf_nat_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - ip_hdr(skb)->daddr != daddr) { - skb_dst_drop(skb); - } - return ret; + return nf_nat_ipv4_fn(ops, skb, in, out, nft_nat_do_chain); } -static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - enum ip_conntrack_info ctinfo __maybe_unused; - const struct nf_conn *ct __maybe_unused; - unsigned int ret; - - ret = nf_nat_fn(ops, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.src.u3.ip != - ct->tuplehash[!dir].tuple.dst.u3.ip || - ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all) - return nf_xfrm_me_harder(skb, AF_INET) == 0 ? - ret : NF_DROP; - } -#endif - return ret; + return nf_nat_ipv4_in(ops, skb, in, out, nft_nat_do_chain); } -static unsigned int nf_nat_output(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - enum ip_conntrack_info ctinfo; - const struct nf_conn *ct; - unsigned int ret; - - ret = nf_nat_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + return nf_nat_ipv4_out(ops, skb, in, out, nft_nat_do_chain); +} - if (ct->tuplehash[dir].tuple.dst.u3.ip != - ct->tuplehash[!dir].tuple.src.u3.ip) { - if (ip_route_me_harder(skb, RTN_UNSPEC)) - ret = NF_DROP; - } -#ifdef CONFIG_XFRM - else if (ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) - if (nf_xfrm_me_harder(skb, AF_INET)) - ret = NF_DROP; -#endif - } - return ret; +static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return nf_nat_ipv4_local_fn(ops, skb, in, out, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv4 = { @@ -168,10 +85,10 @@ static const struct nf_chain_type nft_chain_nat_ipv4 = { (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .hooks = { - [NF_INET_PRE_ROUTING] = nf_nat_prerouting, - [NF_INET_POST_ROUTING] = nf_nat_postrouting, - [NF_INET_LOCAL_OUT] = nf_nat_output, - [NF_INET_LOCAL_IN] = nf_nat_fn, + [NF_INET_PRE_ROUTING] = nft_nat_ipv4_in, + [NF_INET_POST_ROUTING] = nft_nat_ipv4_out, + [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn, + [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn, }, }; -- cgit v1.2.3 From 5d1180fcacc5ceb7da5494acfe9c5e4ebad4f281 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 1 Sep 2014 16:07:26 +0200 Subject: rtnl/do_setlink(): set modified when IFLA_TXQLEN is updated The only effect of this patch is to print a warning if IFLA_TXQLEN is updated and a following change fails. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f0493e3b7471..5bbaf74bf457 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1611,8 +1611,14 @@ static int do_setlink(const struct sk_buff *skb, modified = 1; } - if (tb[IFLA_TXQLEN]) - dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + if (tb[IFLA_TXQLEN]) { + unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); + + if (dev->tx_queue_len ^ value) + modified = 1; + + dev->tx_queue_len = value; + } if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); -- cgit v1.2.3 From 1889b0e7efe8373793069bd3deb7702a51e6f2a5 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 1 Sep 2014 16:07:27 +0200 Subject: rtnl/do_setlink(): set modified when IFLA_LINKMODE is updated The only effect of this patch is to print a warning if IFLA_LINKMODE is updated and a following change fails. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5bbaf74bf457..2bd9fb15b987 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1624,8 +1624,12 @@ static int do_setlink(const struct sk_buff *skb, set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { + unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); + write_lock_bh(&dev_base_lock); - dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + if (dev->link_mode ^ value) + modified = 1; + dev->link_mode = value; write_unlock_bh(&dev_base_lock); } -- cgit v1.2.3 From 90c325e3bfe14ef360de6650fa2a2e92685e5cee Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 1 Sep 2014 16:07:28 +0200 Subject: rtnl/do_setlink(): last arg is now a set of flags There is no functional changes with this commit, it only prepares the next one. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2bd9fb15b987..7d84db3e0597 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1481,9 +1481,10 @@ static int do_set_master(struct net_device *dev, int ifindex) return 0; } +#define DO_SETLINK_MODIFIED 0x01 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct ifinfomsg *ifm, - struct nlattr **tb, char *ifname, int modified) + struct nlattr **tb, char *ifname, int status) { const struct net_device_ops *ops = dev->netdev_ops; int err; @@ -1502,7 +1503,7 @@ static int do_setlink(const struct sk_buff *skb, put_net(net); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MAP]) { @@ -1531,7 +1532,7 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_ADDRESS]) { @@ -1551,19 +1552,19 @@ static int do_setlink(const struct sk_buff *skb, kfree(sa); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MTU]) { err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); - modified = 1; + status |= DO_SETLINK_MODIFIED; } /* @@ -1575,7 +1576,7 @@ static int do_setlink(const struct sk_buff *skb, err = dev_change_name(dev, ifname); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { @@ -1583,7 +1584,7 @@ static int do_setlink(const struct sk_buff *skb, nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_BROADCAST]) { @@ -1601,21 +1602,21 @@ static int do_setlink(const struct sk_buff *skb, err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_TXQLEN]) { unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); if (dev->tx_queue_len ^ value) - modified = 1; + status |= DO_SETLINK_MODIFIED; dev->tx_queue_len = value; } @@ -1628,7 +1629,7 @@ static int do_setlink(const struct sk_buff *skb, write_lock_bh(&dev_base_lock); if (dev->link_mode ^ value) - modified = 1; + status |= DO_SETLINK_MODIFIED; dev->link_mode = value; write_unlock_bh(&dev_base_lock); } @@ -1644,7 +1645,7 @@ static int do_setlink(const struct sk_buff *skb, err = do_setvfinfo(dev, attr); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } } err = 0; @@ -1674,7 +1675,7 @@ static int do_setlink(const struct sk_buff *skb, err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } } err = 0; @@ -1692,7 +1693,7 @@ static int do_setlink(const struct sk_buff *skb, err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_AF_SPEC]) { @@ -1709,13 +1710,13 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } } err = 0; errout: - if (err < 0 && modified) + if (err < 0 && status & DO_SETLINK_MODIFIED) net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); @@ -1999,7 +2000,7 @@ replay: } if (dev) { - int modified = 0; + int status = 0; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; @@ -2014,7 +2015,7 @@ replay: err = ops->changelink(dev, tb, data); if (err < 0) return err; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (linkinfo[IFLA_INFO_SLAVE_DATA]) { @@ -2025,10 +2026,10 @@ replay: tb, slave_data); if (err < 0) return err; - modified = 1; + status |= DO_SETLINK_MODIFIED; } - return do_setlink(skb, dev, ifm, tb, ifname, modified); + return do_setlink(skb, dev, ifm, tb, ifname, status); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { -- cgit v1.2.3 From ba9989069f4e426b1e0ed7018eacc9e1ba607095 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 1 Sep 2014 16:07:29 +0200 Subject: rtnl/do_setlink(): notify when a netdev is modified Depending on which parameters were updated, the changes were not propagated via the notifier chain and netlink. The new flag has been set only when the change did not cause a call to the notifier chain and/or to the netlink notification functions. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7d84db3e0597..a6882686ca3a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1482,6 +1482,8 @@ static int do_set_master(struct net_device *dev, int ifindex) } #define DO_SETLINK_MODIFIED 0x01 +/* notify flag means notify + modified. */ +#define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct ifinfomsg *ifm, struct nlattr **tb, char *ifname, int status) @@ -1532,7 +1534,7 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; - status |= DO_SETLINK_MODIFIED; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_ADDRESS]) { @@ -1564,7 +1566,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GROUP]) { dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); - status |= DO_SETLINK_MODIFIED; + status |= DO_SETLINK_NOTIFY; } /* @@ -1584,7 +1586,7 @@ static int do_setlink(const struct sk_buff *skb, nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; - status |= DO_SETLINK_MODIFIED; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_BROADCAST]) { @@ -1616,7 +1618,7 @@ static int do_setlink(const struct sk_buff *skb, unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); if (dev->tx_queue_len ^ value) - status |= DO_SETLINK_MODIFIED; + status |= DO_SETLINK_NOTIFY; dev->tx_queue_len = value; } @@ -1629,7 +1631,7 @@ static int do_setlink(const struct sk_buff *skb, write_lock_bh(&dev_base_lock); if (dev->link_mode ^ value) - status |= DO_SETLINK_MODIFIED; + status |= DO_SETLINK_NOTIFY; dev->link_mode = value; write_unlock_bh(&dev_base_lock); } @@ -1645,7 +1647,7 @@ static int do_setlink(const struct sk_buff *skb, err = do_setvfinfo(dev, attr); if (err < 0) goto errout; - status |= DO_SETLINK_MODIFIED; + status |= DO_SETLINK_NOTIFY; } } err = 0; @@ -1675,7 +1677,7 @@ static int do_setlink(const struct sk_buff *skb, err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; - status |= DO_SETLINK_MODIFIED; + status |= DO_SETLINK_NOTIFY; } } err = 0; @@ -1693,7 +1695,7 @@ static int do_setlink(const struct sk_buff *skb, err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; - status |= DO_SETLINK_MODIFIED; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_AF_SPEC]) { @@ -1710,15 +1712,20 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; - status |= DO_SETLINK_MODIFIED; + status |= DO_SETLINK_NOTIFY; } } err = 0; errout: - if (err < 0 && status & DO_SETLINK_MODIFIED) - net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", - dev->name); + if (status & DO_SETLINK_MODIFIED) { + if (status & DO_SETLINK_NOTIFY) + netdev_state_change(dev); + + if (err < 0) + net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", + dev->name); + } return err; } @@ -2015,7 +2022,7 @@ replay: err = ops->changelink(dev, tb, data); if (err < 0) return err; - status |= DO_SETLINK_MODIFIED; + status |= DO_SETLINK_NOTIFY; } if (linkinfo[IFLA_INFO_SLAVE_DATA]) { @@ -2026,7 +2033,7 @@ replay: tb, slave_data); if (err < 0) return err; - status |= DO_SETLINK_MODIFIED; + status |= DO_SETLINK_NOTIFY; } return do_setlink(skb, dev, ifm, tb, ifname, status); -- cgit v1.2.3 From 10770bc2d1702e05575db0072e1ebbc06d0b270e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 2 Sep 2014 16:35:33 +0200 Subject: qdisc: adjustments for API allowing skb list xmits Minor adjustments for merge commit 53fda7f7f9e (Merge branch 'xmit_list') that allows us to work with a list of SKBs. Update code doc to function sch_direct_xmit(). In handle_dev_cpu_collision() use kfree_skb_list() in error handling. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a8bf9f9928bd..5b261e91bdbd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -93,7 +93,7 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, * detect it by checking xmit owner and drop the packet when * deadloop is detected. Return OK to try the next skb. */ - kfree_skb(skb); + kfree_skb_list(skb); net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n", dev_queue->dev->name); ret = qdisc_qlen(q); @@ -110,9 +110,9 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, } /* - * Transmit one skb, and handle the return status as required. Holding the - * __QDISC___STATE_RUNNING bit guarantees that only one CPU can execute this - * function. + * Transmit possibly several skbs, and handle the return status as + * required. Holding the __QDISC___STATE_RUNNING bit guarantees that + * only one CPU can execute this function. * * Returns to the caller: * 0 - queue is empty or throttled. -- cgit v1.2.3 From 4549cf2b1803d29cfd019f7bfeaa784f8f9c558f Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Tue, 2 Sep 2014 14:05:10 +0200 Subject: mac80211: fix offloaded BA session traffic after hw restart When starting an offloaded BA session it is unknown what starting sequence number should be used. Using last_seq worked in most cases except after hw restart. When hw restart is requested last_seq is (rightfully so) kept unmodified. This ended up with BA sessions being restarted with an aribtrary BA window values resulting in dropped frames until sequence numbers caught up. Instead of last_seq pick seqno of a first Rxed frame of a given BA session. This fixes stalled traffic after hw restart with offloaded BA sessions (currently only ath10k). Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 5 +++-- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/iface.c | 14 +++----------- net/mac80211/rx.c | 10 ++++++++++ net/mac80211/sta_info.h | 3 +++ 5 files changed, 20 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index f0e84bc48038..a48bad468880 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -227,7 +227,7 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx) + u16 buf_size, bool tx, bool auto_seq) { struct ieee80211_local *local = sta->sdata->local; struct tid_ampdu_rx *tid_agg_rx; @@ -326,6 +326,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, tid_agg_rx->buf_size = buf_size; tid_agg_rx->timeout = timeout; tid_agg_rx->stored_mpdu_num = 0; + tid_agg_rx->auto_seq = auto_seq; status = WLAN_STATUS_SUCCESS; /* activate it for RX */ @@ -367,7 +368,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, __ieee80211_start_rx_ba_session(sta, dialog_token, timeout, start_seq_num, ba_policy, tid, - buf_size, true); + buf_size, true, false); } void ieee80211_start_rx_ba_session_offl(struct ieee80211_vif *vif, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ffb20e5e6cf3..8fe5433f0d35 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1587,7 +1587,7 @@ void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx); + u16 buf_size, bool tx, bool auto_seq); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 01eede7406a5..bb7288e3c41c 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1172,19 +1172,11 @@ static void ieee80211_iface_work(struct work_struct *work) rx_agg = (void *)&skb->cb; mutex_lock(&local->sta_mtx); sta = sta_info_get_bss(sdata, rx_agg->addr); - if (sta) { - u16 last_seq; - - last_seq = le16_to_cpu( - sta->last_seq_ctrl[rx_agg->tid]); - + if (sta) __ieee80211_start_rx_ba_session(sta, - 0, 0, - ieee80211_sn_inc(last_seq), - 1, rx_agg->tid, + 0, 0, 0, 1, rx_agg->tid, IEEE80211_MAX_AMPDU_BUF, - false); - } + false, true); mutex_unlock(&local->sta_mtx); } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_RX_AGG_STOP) { rx_agg = (void *)&skb->cb; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a8d862f9183c..41eb12c87240 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -835,6 +835,16 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata spin_lock(&tid_agg_rx->reorder_lock); + /* + * Offloaded BA sessions have no known starting sequence number so pick + * one from first Rxed frame for this tid after BA was started. + */ + if (unlikely(tid_agg_rx->auto_seq)) { + tid_agg_rx->auto_seq = false; + tid_agg_rx->ssn = mpdu_seq_num; + tid_agg_rx->head_seq_num = mpdu_seq_num; + } + buf_size = tid_agg_rx->buf_size; head_seq_num = tid_agg_rx->head_seq_num; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 89c40d5c0633..16dc1d414f69 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -167,6 +167,8 @@ struct tid_ampdu_tx { * @dialog_token: dialog token for aggregation session * @rcu_head: RCU head used for freeing this struct * @reorder_lock: serializes access to reorder buffer, see below. + * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and + * and ssn. * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. @@ -190,6 +192,7 @@ struct tid_ampdu_rx { u16 buf_size; u16 timeout; u8 dialog_token; + bool auto_seq; }; /** -- cgit v1.2.3 From 3f3c7eec60ad4f990d7bcbc41a1597a4fc7268f6 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Sep 2014 12:12:50 +0200 Subject: qdisc: exit case fixes for skb list handling in qdisc layer More minor fixes to merge commit 53fda7f7f9e (Merge branch 'xmit_list') that allows us to work with a list of SKBs. Fixing exit cases in qdisc_reset() and qdisc_destroy(), where a leftover requeued SKB (qdisc->gso_skb) can have the potential of being a skb list, thus use kfree_skb_list(). This is a followup to commit 10770bc2d1 ("qdisc: adjustments for API allowing skb list xmits"). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5b261e91bdbd..19696ebe9ebc 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -621,7 +621,7 @@ void qdisc_reset(struct Qdisc *qdisc) ops->reset(qdisc); if (qdisc->gso_skb) { - kfree_skb(qdisc->gso_skb); + kfree_skb_list(qdisc->gso_skb); qdisc->gso_skb = NULL; qdisc->q.qlen = 0; } @@ -657,7 +657,7 @@ void qdisc_destroy(struct Qdisc *qdisc) module_put(ops->owner); dev_put(qdisc_dev(qdisc)); - kfree_skb(qdisc->gso_skb); + kfree_skb_list(qdisc->gso_skb); /* * gen_estimator est_timer() might access qdisc->q.lock, * wait a RCU grace period before freeing qdisc. -- cgit v1.2.3 From 1f59533f9ca5634e7b8914252e48aee9d9cbe501 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Sep 2014 17:56:09 +0200 Subject: qdisc: validate frames going through the direct_xmit path In commit 50cbe9ab5f8d ("net: Validate xmit SKBs right when we pull them out of the qdisc") the validation code was moved out of dev_hard_start_xmit and into dequeue_skb. However this overlooked the fact that we do not always enqueue the skb onto a qdisc. First situation is if qdisc have flag TCQ_F_CAN_BYPASS and qdisc is empty. Second situation is if there is no qdisc on the device, which is a common case for software devices. Originally spotted and inital patch by Alexander Duyck. As a result Alex was seeing issues trying to connect to a vhost_net interface after commit 50cbe9ab5f8d was applied. Added a call to validate_xmit_skb() in __dev_xmit_skb(), in the code path for qdiscs with TCQ_F_CAN_BYPASS flag, and in __dev_queue_xmit() when no qdisc. Also handle the error situation where dev_hard_start_xmit() could return a skb list, and does not return dev_xmit_complete(rc) and falls through to the kfree_skb(), in that situation it should call kfree_skb_list(). Fixes: 50cbe9ab5f8d ("net: Validate xmit SKBs right when we pull them out of the qdisc") Signed-off-by: Alexander Duyck Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 3774afc3bebf..2f3dbd657570 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2739,7 +2739,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_bstats_update(q, skb); - if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { + skb = validate_xmit_skb(skb, dev); + if (skb && sch_direct_xmit(skb, q, dev, txq, root_lock)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; @@ -2879,6 +2880,10 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) goto recursion_alert; + skb = validate_xmit_skb(skb, dev); + if (!skb) + goto drop; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { @@ -2904,10 +2909,11 @@ recursion_alert: } rc = -ENETDOWN; +drop: rcu_read_unlock_bh(); atomic_long_inc(&dev->tx_dropped); - kfree_skb(skb); + kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); -- cgit v1.2.3 From 2f711939d2ea9dfaecebecd1324d2ec7a7a21f65 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 2 Sep 2014 15:49:25 +0200 Subject: ipv6: add sysctl_mld_qrv to configure query robustness variable This patch adds a new sysctl_mld_qrv knob to configure the mldv1/v2 query robustness variable. It specifies how many retransmit of unsolicited mld retransmit should happen. Admins might want to tune this on lossy links. Also reset mld state on interface down/up, so we pick up new sysctl settings during interface up event. IPv6 certification requests this knob to be available. I didn't make this knob netns specific, as it is mostly a setting in a physical environment and should be per host. Cc: Flavio Leitner Signed-off-by: Hannes Frederic Sowa Acked-by: Flavio Leitner Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ include/net/ipv6.h | 1 + net/ipv6/mcast.c | 25 +++++++++++++++---------- net/ipv6/sysctl_net_ipv6.c | 10 ++++++++++ 4 files changed, 31 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 3cce8ea6b139..cfc71ac0f764 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1152,6 +1152,11 @@ anycast_src_echo_reply - BOOLEAN FALSE: disabled Default: FALSE +mld_qrv - INTEGER + Controls the MLD query robustness variable (see RFC3810 9.1). + Default: 2 (as specified by RFC3810 9.1) + Minimum: 1 (as specified by RFC6636 4.5) + IPv6 Fragmentation: ip6frag_high_thresh - INTEGER diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a2db816e8461..7e247e9b8765 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -121,6 +121,7 @@ struct frag_hdr { /* sysctls */ extern int sysctl_mld_max_msf; +extern int sysctl_mld_qrv; #define _DEVINC(net, statname, modifier, idev, field) \ ({ \ diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 70881795da96..64919425f1ab 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -121,6 +121,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, #define IPV6_MLD_MAX_MSF 64 int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; +int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; /* * socket join on multicast group @@ -1191,15 +1192,16 @@ static void mld_update_qrv(struct inet6_dev *idev, * and SHOULD NOT be one. Catch this here if we ever run * into such a case in future. */ + const int min_qrv = min(MLD_QRV_DEFAULT, sysctl_mld_qrv); WARN_ON(idev->mc_qrv == 0); if (mlh2->mld2q_qrv > 0) idev->mc_qrv = mlh2->mld2q_qrv; - if (unlikely(idev->mc_qrv < 2)) { + if (unlikely(idev->mc_qrv < min_qrv)) { net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n", - idev->mc_qrv, MLD_QRV_DEFAULT); - idev->mc_qrv = MLD_QRV_DEFAULT; + idev->mc_qrv, min_qrv); + idev->mc_qrv = min_qrv; } } @@ -2478,6 +2480,14 @@ void ipv6_mc_down(struct inet6_dev *idev) mld_clear_delrec(idev); } +static void ipv6_mc_reset(struct inet6_dev *idev) +{ + idev->mc_qrv = sysctl_mld_qrv; + idev->mc_qi = MLD_QI_DEFAULT; + idev->mc_qri = MLD_QRI_DEFAULT; + idev->mc_v1_seen = 0; + idev->mc_maxdelay = unsolicited_report_interval(idev); +} /* Device going up */ @@ -2488,6 +2498,7 @@ void ipv6_mc_up(struct inet6_dev *idev) /* Install multicast list, except for all-nodes (already installed) */ read_lock_bh(&idev->lock); + ipv6_mc_reset(idev); for (i = idev->mc_list; i; i = i->next) igmp6_group_added(i); read_unlock_bh(&idev->lock); @@ -2508,13 +2519,7 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) (unsigned long)idev); setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire, (unsigned long)idev); - - idev->mc_qrv = MLD_QRV_DEFAULT; - idev->mc_qi = MLD_QI_DEFAULT; - idev->mc_qri = MLD_QRI_DEFAULT; - - idev->mc_maxdelay = unsolicited_report_interval(idev); - idev->mc_v1_seen = 0; + ipv6_mc_reset(idev); write_unlock_bh(&idev->lock); } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 0c56c93619e0..c5c10fafcfe2 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -16,6 +16,8 @@ #include #include +static int one = 1; + static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", @@ -63,6 +65,14 @@ static struct ctl_table ipv6_rotable[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "mld_qrv", + .data = &sysctl_mld_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, { } }; -- cgit v1.2.3 From a9fe8e29945d56f35235a3a0fba99b4cf181d211 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 2 Sep 2014 15:49:26 +0200 Subject: ipv4: implement igmp_qrv sysctl to tune igmp robustness variable As in IPv6 people might increase the igmp query robustness variable to make sure unsolicited state change reports aren't lost on the network. Add and document this new knob to igmp code. RFCs allow tuning this parameter back to first IGMP RFC, so we also use this setting for all counters, including source specific multicast. Also take over sysctl value when upping the interface and don't reuse the last one seen on the interface. Cc: Flavio Leitner Signed-off-by: Hannes Frederic Sowa Acked-by: Flavio Leitner Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ include/linux/igmp.h | 1 + net/ipv4/igmp.c | 31 +++++++++++++++---------------- net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ 4 files changed, 31 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index cfc71ac0f764..db2383cb1df9 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -844,6 +844,11 @@ igmp_max_memberships - INTEGER conf/all/* is special, changes the settings for all interfaces +igmp_qrv - INTEGER + Controls the IGMP query robustness variable (see RFC2236 8.1). + Default: 2 (as specified by RFC2236 8.1) + Minimum: 1 (as specified by RFC6636 4.5) + log_martians - BOOLEAN Log packets with impossible addresses to kernel log. log_martians for the interface will be enabled if at least one of diff --git a/include/linux/igmp.h b/include/linux/igmp.h index f47550d75f85..2c677afeea47 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -39,6 +39,7 @@ static inline struct igmpv3_query * extern int sysctl_igmp_max_memberships; extern int sysctl_igmp_max_msf; +extern int sysctl_igmp_qrv; struct ip_sf_socklist { unsigned int sl_max; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 890c4258804c..4146153d875d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -117,7 +117,7 @@ #define IGMP_V2_Unsolicited_Report_Interval (10*HZ) #define IGMP_V3_Unsolicited_Report_Interval (1*HZ) #define IGMP_Query_Response_Interval (10*HZ) -#define IGMP_Unsolicited_Report_Count 2 +#define IGMP_Query_Robustness_Variable 2 #define IGMP_Initial_Report_Delay (1) @@ -756,8 +756,7 @@ static void igmp_ifc_event(struct in_device *in_dev) { if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv; igmp_ifc_start_timer(in_dev, 1); } @@ -1086,8 +1085,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; @@ -1226,8 +1224,7 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; igmp_ifc_event(in_dev); #endif } @@ -1322,7 +1319,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); - im->unsolicit_count = IGMP_Unsolicited_Report_Count; + im->unsolicit_count = sysctl_igmp_qrv; #endif im->next_rcu = in_dev->mc_list; @@ -1460,7 +1457,7 @@ void ip_mc_init_dev(struct in_device *in_dev) (unsigned long)in_dev); setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, (unsigned long)in_dev); - in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; + in_dev->mr_qrv = sysctl_igmp_qrv; #endif spin_lock_init(&in_dev->mc_tomb_lock); @@ -1474,6 +1471,9 @@ void ip_mc_up(struct in_device *in_dev) ASSERT_RTNL(); +#ifdef CONFIG_IP_MULTICAST + in_dev->mr_qrv = sysctl_igmp_qrv; +#endif ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) @@ -1540,7 +1540,9 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) */ int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; - +#ifdef CONFIG_IP_MULTICAST +int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable; +#endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) @@ -1575,8 +1577,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { - psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; @@ -1639,8 +1640,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -1818,8 +1818,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 79a007c52558..45d156dacd61 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -450,6 +450,16 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_IP_MULTICAST + { + .procname = "igmp_qrv", + .data = &sysctl_igmp_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, +#endif { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, -- cgit v1.2.3 From c10a19930f286a24f4994c83fe71866877fb1a71 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 5 Sep 2014 10:56:35 +0200 Subject: mac80211: clean up ieee80211_i.h Not sure how the declaration of ieee80211_tdls_peer_del_work landed after the double inclusion protection end. Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8fe5433f0d35..78f95387e43f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1917,7 +1917,7 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, size_t extra_ies_len); int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper); - +void ieee80211_tdls_peer_del_work(struct work_struct *wk); extern const struct ethtool_ops ieee80211_ethtool_ops; @@ -1928,4 +1928,3 @@ extern const struct ethtool_ops ieee80211_ethtool_ops; #endif #endif /* IEEE80211_I_H */ -void ieee80211_tdls_peer_del_work(struct work_struct *wk); -- cgit v1.2.3 From d98ad83ee86e523cc00cbf425f456fbd14b4fdc4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Sep 2014 15:24:57 +0300 Subject: mac80211: add Intel Mobile Communications copyright Our legal structure changed at some point (see wikipedia), but we forgot to immediately switch over to the new copyright notice. For files that we have modified in the time since the change, add the proper copyright notice now. Signed-off-by: Johannes Berg Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 1 + net/mac80211/cfg.c | 1 + net/mac80211/debugfs.c | 1 + net/mac80211/debugfs_sta.c | 1 + net/mac80211/ibss.c | 1 + net/mac80211/ieee80211_i.h | 1 + net/mac80211/iface.c | 1 + net/mac80211/key.c | 1 + net/mac80211/main.c | 1 + net/mac80211/mlme.c | 1 + net/mac80211/rx.c | 1 + net/mac80211/scan.c | 1 + net/mac80211/sta_info.c | 1 + net/mac80211/sta_info.h | 1 + net/mac80211/status.c | 1 + net/mac80211/tdls.c | 1 + net/mac80211/tx.c | 1 + net/mac80211/util.c | 1 + net/mac80211/wme.c | 1 + 19 files changed, 19 insertions(+) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c9b2bec8db47..db4975c33861 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4,6 +4,7 @@ * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 4d8989b87960..d7f437ef6e7e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2,6 +2,7 @@ * mac80211 configuration hooks for cfg80211 * * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This file is GPLv2 as found in COPYING. */ diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 0e963bc1ceac..574656174f91 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -3,6 +3,7 @@ * mac80211 debugfs for wireless PHYs * * Copyright 2007 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * GPLv2 * diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 4a20fb8f1e23..b5e0a5344e8c 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -2,6 +2,7 @@ * Copyright 2003-2005 Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc * Copyright 2007 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 5f9654d31a8d..56b53571c807 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -6,6 +6,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2009, Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 78f95387e43f..b9d9508a6286 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -3,6 +3,7 @@ * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index bb7288e3c41c..af237223a8cd 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -5,6 +5,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc * Copyright 2008, Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 6429d0e1d4a1..f320a04380b9 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007-2008 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/main.c b/net/mac80211/main.c index e0ab4320a078..0de7c93bf62b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -2,6 +2,7 @@ * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 29fe91d6a094..305e5105aeb1 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5,6 +5,7 @@ * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 41eb12c87240..b04ca4049c95 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index a9bb6eb8c3e0..af0d094b2f2f 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -6,6 +6,7 @@ * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index e1f957d5935e..215752e8c5b6 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1,6 +1,7 @@ /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 16dc1d414f69..9dc7a9d6b0d2 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -1,5 +1,6 @@ /* * Copyright 2002-2005, Devicescape Software, Inc. + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/status.c b/net/mac80211/status.c index aa06dcad336e..2800e1c65519 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2008-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index f2cb3b6c1871..bcf51e43ccc1 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -3,6 +3,7 @@ * * Copyright 2006-2010 Johannes Berg * Copyright 2014, Intel Corporation + * Copyright 2014 Intel Mobile Communications GmbH * * This file is GPLv2 as found in COPYING. */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 925c39f4099e..d1b3c223d6f9 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 725af7a468d2..b444f27a788e 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 6459946f0b74..3b873989992c 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -1,5 +1,6 @@ /* * Copyright 2004, Instant802 Networks, Inc. + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as -- cgit v1.2.3 From 2740f0cf8ec8bc7ee6a58f68841759e367dda98f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Sep 2014 15:24:58 +0300 Subject: cfg80211: add Intel Mobile Communications copyright Our legal structure changed at some point (see wikipedia), but we forgot to immediately switch over to the new copyright notice. For files that we have modified in the time since the change, add the proper copyright notice now. Signed-off-by: Johannes Berg Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 1 + net/wireless/chan.c | 1 + net/wireless/core.c | 1 + net/wireless/nl80211.c | 1 + net/wireless/reg.c | 1 + net/wireless/scan.c | 1 + net/wireless/util.c | 1 + 8 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 8018c915ee63..77d4f26e0235 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -6,6 +6,7 @@ * Copyright (c) 2002-2003, Jouni Malinen * Copyright (c) 2005, Devicescape Software, Inc. * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ab21299c8f4d..6be68bb31918 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4,6 +4,7 @@ * 802.11 device and configuration interface * * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 992b34070bcb..72d81e2154d5 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -4,6 +4,7 @@ * any point in time. * * Copyright 2009 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include diff --git a/net/wireless/core.c b/net/wireless/core.c index a207eb116829..9698fe709251 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -2,6 +2,7 @@ * This is the linux wireless configuration interface. * * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3011401f52c0..d876146498dd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2,6 +2,7 @@ * This is the new netlink-based wireless configuration interface. * * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 1afdf45db38f..8bfc9b172a49 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2007 Johannes Berg * Copyright 2008-2011 Luis R. Rodriguez + * Copyright 2013-2014 Intel Mobile Communications GmbH * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 620a4b40d466..bda39f149810 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2,6 +2,7 @@ * cfg80211 scan result handling * * Copyright 2008 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include #include diff --git a/net/wireless/util.c b/net/wireless/util.c index 728f1c0dc70d..a8b2816ffcb9 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2,6 +2,7 @@ * Wireless utility functions * * Copyright 2007-2009 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include #include -- cgit v1.2.3 From 6188c271f0f2cbc89a52981d252107f7f409f45f Mon Sep 17 00:00:00 2001 From: Liad Kaufman Date: Wed, 3 Sep 2014 15:24:59 +0300 Subject: mac80211: fix description comment of ieee80211_subif_start_xmit The function description claimed that on error the skb isn't freed even though it is, and stated return values that are different than what really happens in the code. Signed-off-by: Liad Kaufman Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index d1b3c223d6f9..2f7754ca59d2 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1789,9 +1789,8 @@ static void ieee80211_tx_latency_start_msrmnt(struct ieee80211_local *local, * @skb: packet to be sent * @dev: incoming interface * - * Returns: 0 on success (and frees skb in this case) or 1 on failure (skb will - * not be freed, and caller is responsible for either retrying later or freeing - * skb). + * Returns: NETDEV_TX_OK both on success and on failure. On failure skb will + * be freed. * * This function takes in an Ethernet header and encapsulates it with suitable * IEEE 802.11 header based on which interface the packet is coming in. The -- cgit v1.2.3 From bab5ab7d2a5466406e8003d038cc7ce6b2d5d804 Mon Sep 17 00:00:00 2001 From: Assaf Krauss Date: Wed, 3 Sep 2014 15:25:01 +0300 Subject: nl80211: Add flag attribute for RRM connections Add a flag attribute to use in associations, for tagging the target connection as supporting RRM. It is the responsibility of upper layers to set this flag only if both the underlying device, and the target network indeed support RRM. To be used in ASSOCIATE and CONNECT commands. Signed-off-by: Assaf Krauss Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 13 +++++++++++++ net/wireless/nl80211.c | 17 +++++++++++++++++ 3 files changed, 32 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6be68bb31918..a887eeb5b31e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1608,10 +1608,12 @@ struct cfg80211_auth_request { * * @ASSOC_REQ_DISABLE_HT: Disable HT (802.11n) * @ASSOC_REQ_DISABLE_VHT: Disable VHT + * @ASSOC_REQ_USE_RRM: Declare RRM capability in this association */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), ASSOC_REQ_DISABLE_VHT = BIT(1), + ASSOC_REQ_USE_RRM = BIT(2), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c166d3d23e55..de69d3df5e55 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1594,6 +1594,17 @@ enum nl80211_commands { * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. * + * @NL80211_ATTR_USE_RRM: flag for indicating whether the current connection + * shall support Radio Resource Measurements (11k). This attribute can be + * used with %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests. + * User space applications are expected to use this flag only if the + * underlying device supports these minimal RRM features: + * %NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES, + * %NL80211_FEATURE_QUIET, + * If this flag is used, driver must add the Power Capabilities IE to the + * association request. In addition, it must also set the RRM capability + * flag in the association request's Capability Info field. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1936,6 +1947,8 @@ enum nl80211_attrs { NL80211_ATTR_TDLS_INITIATOR, + NL80211_ATTR_USE_RRM, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d876146498dd..459dc2769d0e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -389,6 +389,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 }, [NL80211_ATTR_IFACE_SOCKET_OWNER] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, + [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -6584,6 +6585,14 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) sizeof(req.vht_capa)); } + if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { + if (!(rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || + !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + return -EINVAL; + req.flags |= ASSOC_REQ_USE_RRM; + } + err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { wdev_lock(dev->ieee80211_ptr); @@ -7241,6 +7250,14 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) sizeof(connect.vht_capa)); } + if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { + if (!(rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || + !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + return -EINVAL; + connect.flags |= ASSOC_REQ_USE_RRM; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL); wdev_unlock(dev->ieee80211_ptr); -- cgit v1.2.3 From cd2f5dd709daa8a70f9eb408025dbb1c804929a8 Mon Sep 17 00:00:00 2001 From: Assaf Krauss Date: Wed, 3 Sep 2014 15:25:02 +0300 Subject: mac80211: Add RRM support to assoc request In case of a RRM-supporting connection, in the association request frame: set the RRM capability flag, and add the required IEs. Signed-off-by: Assaf Krauss Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b9d9508a6286..6b7bdd8fae29 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -355,6 +355,7 @@ enum ieee80211_sta_flags { IEEE80211_STA_DISABLE_80P80MHZ = BIT(12), IEEE80211_STA_DISABLE_160MHZ = BIT(13), IEEE80211_STA_DISABLE_WMM = BIT(14), + IEEE80211_STA_ENABLE_RRM = BIT(15), }; struct ieee80211_mgd_auth_data { diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 305e5105aeb1..27231ac2dbdd 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -673,6 +673,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT)) capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; + if (ifmgd->flags & IEEE80211_STA_ENABLE_RRM) + capab |= WLAN_CAPABILITY_RADIO_MEASURE; + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); memset(mgmt, 0, 24); memcpy(mgmt->da, assoc_data->bss->bssid, ETH_ALEN); @@ -738,16 +741,17 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) } } - if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT) { - /* 1. power capabilities */ + if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT || + capab & WLAN_CAPABILITY_RADIO_MEASURE) { pos = skb_put(skb, 4); *pos++ = WLAN_EID_PWR_CAPABILITY; *pos++ = 2; *pos++ = 0; /* min tx power */ /* max tx power */ *pos++ = ieee80211_chandef_max_power(&chanctx_conf->def); + } - /* 2. supported channels */ + if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT) { /* TODO: get this in reg domain format */ pos = skb_put(skb, 2 * sband->n_channels + 2); *pos++ = WLAN_EID_SUPPORTED_CHANNELS; @@ -4410,6 +4414,11 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, ifmgd->flags &= ~IEEE80211_STA_MFP_ENABLED; } + if (req->flags & ASSOC_REQ_USE_RRM) + ifmgd->flags |= IEEE80211_STA_ENABLE_RRM; + else + ifmgd->flags &= ~IEEE80211_STA_ENABLE_RRM; + if (req->crypto.control_port) ifmgd->flags |= IEEE80211_STA_CONTROL_PORT; else -- cgit v1.2.3 From a62a1aed3733d7ec6489adca4c2f69881d78cfd6 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 3 Sep 2014 15:25:03 +0300 Subject: cfg80211: avoid duplicate entries on regdomain intersection The regdom intersection code simply tries intersecting each rule of the source with each rule of the target. Since the resulting intersections are not observed as a whole, this can result in multiple overlapping/duplicate entries. Make the rule addition a bit more smarter, by looking for rules that can be contained within other rules, and adding only extended ones. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/wireless/reg.c | 81 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 8bfc9b172a49..b725a31a4751 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -799,6 +799,57 @@ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1, return 0; } +/* check whether old rule contains new rule */ +static bool rule_contains(struct ieee80211_reg_rule *r1, + struct ieee80211_reg_rule *r2) +{ + /* for simplicity, currently consider only same flags */ + if (r1->flags != r2->flags) + return false; + + /* verify r1 is more restrictive */ + if ((r1->power_rule.max_antenna_gain > + r2->power_rule.max_antenna_gain) || + r1->power_rule.max_eirp > r2->power_rule.max_eirp) + return false; + + /* make sure r2's range is contained within r1 */ + if (r1->freq_range.start_freq_khz > r2->freq_range.start_freq_khz || + r1->freq_range.end_freq_khz < r2->freq_range.end_freq_khz) + return false; + + /* and finally verify that r1.max_bw >= r2.max_bw */ + if (r1->freq_range.max_bandwidth_khz < + r2->freq_range.max_bandwidth_khz) + return false; + + return true; +} + +/* add or extend current rules. do nothing if rule is already contained */ +static void add_rule(struct ieee80211_reg_rule *rule, + struct ieee80211_reg_rule *reg_rules, u32 *n_rules) +{ + struct ieee80211_reg_rule *tmp_rule; + int i; + + for (i = 0; i < *n_rules; i++) { + tmp_rule = ®_rules[i]; + /* rule is already contained - do nothing */ + if (rule_contains(tmp_rule, rule)) + return; + + /* extend rule if possible */ + if (rule_contains(rule, tmp_rule)) { + memcpy(tmp_rule, rule, sizeof(*rule)); + return; + } + } + + memcpy(®_rules[*n_rules], rule, sizeof(*rule)); + (*n_rules)++; +} + /** * regdom_intersect - do the intersection between two regulatory domains * @rd1: first regulatory domain @@ -818,12 +869,10 @@ regdom_intersect(const struct ieee80211_regdomain *rd1, { int r, size_of_regd; unsigned int x, y; - unsigned int num_rules = 0, rule_idx = 0; + unsigned int num_rules = 0; const struct ieee80211_reg_rule *rule1, *rule2; - struct ieee80211_reg_rule *intersected_rule; + struct ieee80211_reg_rule intersected_rule; struct ieee80211_regdomain *rd; - /* This is just a dummy holder to help us count */ - struct ieee80211_reg_rule dummy_rule; if (!rd1 || !rd2) return NULL; @@ -841,7 +890,7 @@ regdom_intersect(const struct ieee80211_regdomain *rd1, for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; if (!reg_rules_intersect(rd1, rd2, rule1, rule2, - &dummy_rule)) + &intersected_rule)) num_rules++; } } @@ -856,34 +905,24 @@ regdom_intersect(const struct ieee80211_regdomain *rd1, if (!rd) return NULL; - for (x = 0; x < rd1->n_reg_rules && rule_idx < num_rules; x++) { + for (x = 0; x < rd1->n_reg_rules; x++) { rule1 = &rd1->reg_rules[x]; - for (y = 0; y < rd2->n_reg_rules && rule_idx < num_rules; y++) { + for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; - /* - * This time around instead of using the stack lets - * write to the target rule directly saving ourselves - * a memcpy() - */ - intersected_rule = &rd->reg_rules[rule_idx]; r = reg_rules_intersect(rd1, rd2, rule1, rule2, - intersected_rule); + &intersected_rule); /* * No need to memset here the intersected rule here as * we're not using the stack anymore */ if (r) continue; - rule_idx++; - } - } - if (rule_idx != num_rules) { - kfree(rd); - return NULL; + add_rule(&intersected_rule, rd->reg_rules, + &rd->n_reg_rules); + } } - rd->n_reg_rules = num_rules; rd->alpha2[0] = '9'; rd->alpha2[1] = '8'; rd->dfs_region = reg_intersect_dfs_region(rd1->dfs_region, -- cgit v1.2.3 From 24ecd45e2eb194dcadefeb60a16f4ca751402413 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 3 Sep 2014 15:25:05 +0300 Subject: mac80211: adjust roc duration when combining ROCs The new duration (remaining duration after the current ROC ends) was calculated but not used, making the optimization worthless. Signed-off-by: Eliad Peller Reviewed-by: Ilan Peer Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d7f437ef6e7e..4e57b67a91dd 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2496,6 +2496,7 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, if (new_dur > 0) { /* add right after tmp */ + roc->duration = new_dur; list_add(&roc->list, &tmp->list); } else { list_add_tail(&roc->list, -- cgit v1.2.3 From eaa336b0f5087addb32217e6a70845ed57249f76 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 3 Sep 2014 15:25:06 +0300 Subject: mac80211: combine roc with the "next roc" if possible If the remaining time in the current roc is not long enough, mac80211 adds the new roc right after it (if they have similar params). However, in case of multiple rocs, the "next roc" is not considered, resulting in multiple rocs, each one with its own duration. Refactor the code a bit and consider the next roc, so a single max roc will be used instead of multiple rocs (which might last much longer). Signed-off-by: Eliad Peller Reviewed-by: Ilan Peer Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 77 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 4e57b67a91dd..4c1681bae232 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2352,6 +2352,58 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, return 0; } +static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local, + struct ieee80211_roc_work *new_roc, + struct ieee80211_roc_work *cur_roc) +{ + unsigned long j = jiffies; + unsigned long cur_roc_end = cur_roc->hw_start_time + + msecs_to_jiffies(cur_roc->duration); + struct ieee80211_roc_work *next_roc; + int new_dur; + + if (WARN_ON(!cur_roc->started || !cur_roc->hw_begun)) + return false; + + if (time_after(j + IEEE80211_ROC_MIN_LEFT, cur_roc_end)) + return false; + + ieee80211_handle_roc_started(new_roc); + + new_dur = new_roc->duration - jiffies_to_msecs(cur_roc_end - j); + + /* cur_roc is long enough - add new_roc to the dependents list. */ + if (new_dur <= 0) { + list_add_tail(&new_roc->list, &cur_roc->dependents); + return true; + } + + new_roc->duration = new_dur; + + /* + * if cur_roc was already coalesced before, we might + * want to extend the next roc instead of adding + * a new one. + */ + next_roc = list_entry(cur_roc->list.next, + struct ieee80211_roc_work, list); + if (&next_roc->list != &local->roc_list && + next_roc->chan == new_roc->chan && + next_roc->sdata == new_roc->sdata && + !WARN_ON(next_roc->started)) { + list_add_tail(&new_roc->list, &next_roc->dependents); + next_roc->duration = max(next_roc->duration, + new_roc->duration); + next_roc->type = max(next_roc->type, new_roc->type); + return true; + } + + /* add right after cur_roc */ + list_add(&new_roc->list, &cur_roc->list); + + return true; +} + static int ieee80211_start_roc_work(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, @@ -2457,8 +2509,6 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, /* If it has already started, it's more difficult ... */ if (local->ops->remain_on_channel) { - unsigned long j = jiffies; - /* * In the offloaded ROC case, if it hasn't begun, add * this new one to the dependent list to be handled @@ -2481,29 +2531,8 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, break; } - if (time_before(j + IEEE80211_ROC_MIN_LEFT, - tmp->hw_start_time + - msecs_to_jiffies(tmp->duration))) { - int new_dur; - - ieee80211_handle_roc_started(roc); - - new_dur = roc->duration - - jiffies_to_msecs(tmp->hw_start_time + - msecs_to_jiffies( - tmp->duration) - - j); - - if (new_dur > 0) { - /* add right after tmp */ - roc->duration = new_dur; - list_add(&roc->list, &tmp->list); - } else { - list_add_tail(&roc->list, - &tmp->dependents); - } + if (ieee80211_coalesce_started_roc(local, roc, tmp)) queued = true; - } } else if (del_timer_sync(&tmp->work.timer)) { unsigned long new_end; -- cgit v1.2.3 From 3057dbfdab1b86a77ed6d512fc857b032f78663b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 4 Sep 2014 23:57:40 +0200 Subject: cfg80211: enable dynack through nl80211 Enable ACK timeout estimation algorithm (dynack) using mac80211 set_coverage_class API. Dynack is activated passing coverage class equals to -1 to lower drivers and it is automatically disabled setting valid value for coverage class. Define NL80211_ATTR_WIPHY_DYN_ACK flag attribute to enable dynack from userspace. In order to activate dynack NL80211_FEATURE_ACKTO_ESTIMATION feature flag must be set by lower drivers to indicate dynack capability. Signed-off-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 12 ++++++++++++ net/wireless/nl80211.c | 11 +++++++++++ 3 files changed, 25 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a887eeb5b31e..0d17ec9df692 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1805,6 +1805,7 @@ struct cfg80211_connect_params { * @WIPHY_PARAM_FRAG_THRESHOLD: wiphy->frag_threshold has changed * @WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed * @WIPHY_PARAM_COVERAGE_CLASS: coverage class changed + * @WIPHY_PARAM_DYN_ACK: dynack has been enabled */ enum wiphy_params_flags { WIPHY_PARAM_RETRY_SHORT = 1 << 0, @@ -1812,6 +1813,7 @@ enum wiphy_params_flags { WIPHY_PARAM_FRAG_THRESHOLD = 1 << 2, WIPHY_PARAM_RTS_THRESHOLD = 1 << 3, WIPHY_PARAM_COVERAGE_CLASS = 1 << 4, + WIPHY_PARAM_DYN_ACK = 1 << 5, }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index de69d3df5e55..29c4399e08b9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1605,6 +1605,12 @@ enum nl80211_commands { * association request. In addition, it must also set the RRM capability * flag in the association request's Capability Info field. * + * @NL80211_ATTR_WIPHY_DYN_ACK: flag attribute used to enable ACK timeout + * estimation algorithm (dynack). In order to activate dynack + * %NL80211_FEATURE_ACKTO_ESTIMATION feature flag must be set by lower + * drivers to indicate dynack capability. Dynack is automatically disabled + * setting valid value for coverage class. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1949,6 +1955,8 @@ enum nl80211_attrs { NL80211_ATTR_USE_RRM, + NL80211_ATTR_WIPHY_DYN_ACK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3991,6 +3999,9 @@ enum nl80211_ap_sme_features { * current tx power value into the TPC Report IE in the spectrum * management TPC Report action frame, and in the Radio Measurement Link * Measurement Report action frame. + * @NL80211_FEATURE_ACKTO_ESTIMATION: This driver supports dynamic ACK timeout + * estimation (dynack). %NL80211_ATTR_WIPHY_DYN_ACK flag attribute is used + * to enable dynack. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -4016,6 +4027,7 @@ enum nl80211_feature_flags { NL80211_FEATURE_WFA_TPC_IE_IN_PROBES = 1 << 20, NL80211_FEATURE_QUIET = 1 << 21, NL80211_FEATURE_TX_POWER_INSERTION = 1 << 22, + NL80211_FEATURE_ACKTO_ESTIMATION = 1 << 23, }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 459dc2769d0e..cf178d2b621d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -226,6 +226,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 }, + [NL80211_ATTR_WIPHY_DYN_ACK] = { .type = NLA_FLAG }, [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 }, [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, @@ -2239,11 +2240,21 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) { + if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) + return -EINVAL; + coverage_class = nla_get_u8( info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]); changed |= WIPHY_PARAM_COVERAGE_CLASS; } + if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) { + if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) + return -EOPNOTSUPP; + + changed |= WIPHY_PARAM_DYN_ACK; + } + if (changed) { u8 old_retry_short, old_retry_long; u32 old_frag_threshold, old_rts_threshold; -- cgit v1.2.3 From a4bcaf5556da649f0160e60fa7b4bb2c29801c12 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 4 Sep 2014 23:57:41 +0200 Subject: mac80211: extend set_coverage_class signature Extend mac80211 set_coverage_class API in order to enable ACK timeout estimation algorithm (dynack) passing coverage class equals to -1 to lower drivers. Synchronize set_coverage_class routine signature with mac80211 function pointer for p54, ath9k, ath9k_htc and ath5k drivers. Signed-off-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath5k/mac80211-ops.c | 2 +- drivers/net/wireless/ath/ath9k/htc_drv_main.c | 2 +- drivers/net/wireless/ath/ath9k/main.c | 3 ++- drivers/net/wireless/p54/main.c | 3 ++- include/net/mac80211.h | 6 ++++-- net/mac80211/cfg.c | 9 +++++++-- net/mac80211/driver-ops.h | 2 +- net/mac80211/trace.h | 4 ++-- 8 files changed, 20 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/ath5k/mac80211-ops.c b/drivers/net/wireless/ath/ath5k/mac80211-ops.c index b65c38fdaa4b..ab2709a43768 100644 --- a/drivers/net/wireless/ath/ath5k/mac80211-ops.c +++ b/drivers/net/wireless/ath/ath5k/mac80211-ops.c @@ -704,7 +704,7 @@ ath5k_get_survey(struct ieee80211_hw *hw, int idx, struct survey_info *survey) * reset. */ static void -ath5k_set_coverage_class(struct ieee80211_hw *hw, u8 coverage_class) +ath5k_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) { struct ath5k_hw *ah = hw->priv; diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c index 5627917c5ff7..994fff1ff519 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c @@ -1722,7 +1722,7 @@ static int ath9k_htc_set_rts_threshold(struct ieee80211_hw *hw, u32 value) } static void ath9k_htc_set_coverage_class(struct ieee80211_hw *hw, - u8 coverage_class) + s16 coverage_class) { struct ath9k_htc_priv *priv = hw->priv; diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index e6ac8d2e610c..97c028e36647 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -1959,7 +1959,8 @@ static int ath9k_get_survey(struct ieee80211_hw *hw, int idx, return 0; } -static void ath9k_set_coverage_class(struct ieee80211_hw *hw, u8 coverage_class) +static void ath9k_set_coverage_class(struct ieee80211_hw *hw, + s16 coverage_class) { struct ath_softc *sc = hw->priv; struct ath_hw *ah = sc->sc_ah; diff --git a/drivers/net/wireless/p54/main.c b/drivers/net/wireless/p54/main.c index 7be3a4839640..97aeff0edb84 100644 --- a/drivers/net/wireless/p54/main.c +++ b/drivers/net/wireless/p54/main.c @@ -696,7 +696,8 @@ static void p54_flush(struct ieee80211_hw *dev, struct ieee80211_vif *vif, WARN(total, "tx flush timeout, unresponsive firmware"); } -static void p54_set_coverage_class(struct ieee80211_hw *dev, u8 coverage_class) +static void p54_set_coverage_class(struct ieee80211_hw *dev, + s16 coverage_class) { struct p54_common *priv = dev->priv; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index db4975c33861..c6e6a6804ee4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2673,7 +2673,9 @@ enum ieee80211_roc_type { * * @set_coverage_class: Set slot time for given coverage class as specified * in IEEE 802.11-2007 section 17.3.8.6 and modify ACK timeout - * accordingly. This callback is not required and may sleep. + * accordingly; coverage class equals to -1 to enable ACK timeout + * estimation algorithm (dynack). To disable dynack set valid value for + * coverage class. This callback is not required and may sleep. * * @testmode_cmd: Implement a cfg80211 test mode command. The passed @vif may * be %NULL. The callback can sleep. @@ -2957,7 +2959,7 @@ struct ieee80211_ops { int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); - void (*set_coverage_class)(struct ieee80211_hw *hw, u8 coverage_class); + void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class); #ifdef CONFIG_NL80211_TESTMODE int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 4c1681bae232..101ae6cfad81 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1978,8 +1978,13 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) return err; } - if (changed & WIPHY_PARAM_COVERAGE_CLASS) { - err = drv_set_coverage_class(local, wiphy->coverage_class); + if ((changed & WIPHY_PARAM_COVERAGE_CLASS) || + (changed & WIPHY_PARAM_DYN_ACK)) { + s16 coverage_class; + + coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ? + wiphy->coverage_class : -1; + err = drv_set_coverage_class(local, coverage_class); if (err) return err; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 11423958116a..196d48c68134 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -450,7 +450,7 @@ static inline int drv_set_rts_threshold(struct ieee80211_local *local, } static inline int drv_set_coverage_class(struct ieee80211_local *local, - u8 value) + s16 value) { int ret = 0; might_sleep(); diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 02ac535d1274..38fae7ebe984 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -672,13 +672,13 @@ DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold, ); TRACE_EVENT(drv_set_coverage_class, - TP_PROTO(struct ieee80211_local *local, u8 value), + TP_PROTO(struct ieee80211_local *local, s16 value), TP_ARGS(local, value), TP_STRUCT__entry( LOCAL_ENTRY - __field(u8, value) + __field(s16, value) ), TP_fast_assign( -- cgit v1.2.3 From f3000e1b43f164802f2a74c9de6a398943a36378 Mon Sep 17 00:00:00 2001 From: Eyal Shapira Date: Thu, 4 Sep 2014 22:37:55 +0300 Subject: mac80211: fix broken use of VHT/20Mhz with some APs commit "mac80211: disable 40MHz support in case of 20MHz AP" broke working VHT in 20Mhz with APs like Netgear R6300v2 which do not publish support for 40Mhz but allow use of VHT in 20Mhz. The break is because VHT is disabled once no HT cap doesn't indicate support for 40Mhz. This causes the assoc request to be sent without any VHT IE and the association is only HT due to this. For more details check out commit 4a817aa7 "mac80211: allow VHT with peers not capable of 40MHz" Fixes: 53b954ee4a71 ("mac80211: disable 40MHz support in case of 20MHz AP") Signed-off-by: Eyal Shapira Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 27231ac2dbdd..f09bef7ac21d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -173,7 +173,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, if (!(ht_cap->cap_info & cpu_to_le16(IEEE80211_HT_CAP_SUP_WIDTH_20_40))) { - ret = IEEE80211_STA_DISABLE_40MHZ | IEEE80211_STA_DISABLE_VHT; + ret = IEEE80211_STA_DISABLE_40MHZ; goto out; } -- cgit v1.2.3 From 60a3b2253c413cf601783b070507d7dd6620c954 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Sep 2014 22:53:44 +0200 Subject: net: bpf: make eBPF interpreter images read-only With eBPF getting more extended and exposure to user space is on it's way, hardening the memory range the interpreter uses to steer its command flow seems appropriate. This patch moves the to be interpreted bytecode to read-only pages. In case we execute a corrupted BPF interpreter image for some reason e.g. caused by an attacker which got past a verifier stage, it would not only provide arbitrary read/write memory access but arbitrary function calls as well. After setting up the BPF interpreter image, its contents do not change until destruction time, thus we can setup the image on immutable made pages in order to mitigate modifications to that code. The idea is derived from commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks"). This is possible because bpf_prog is not part of sk_filter anymore. After setup bpf_prog cannot be altered during its life-time. This prevents any modifications to the entire bpf_prog structure (incl. function/JIT image pointer). Every eBPF program (including classic BPF that are migrated) have to call bpf_prog_select_runtime() to select either interpreter or a JIT image as a last setup step, and they all are being freed via bpf_prog_free(), including non-JIT. Therefore, we can easily integrate this into the eBPF life-time, plus since we directly allocate a bpf_prog, we have no performance penalty. Tested with seccomp and test_bpf testsuite in JIT/non-JIT mode and manual inspection of kernel_page_tables. Brad Spengler proposed the same idea via Twitter during development of this patch. Joint work with Hannes Frederic Sowa. Suggested-by: Brad Spengler Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/arm/net/bpf_jit_32.c | 3 +- arch/mips/net/bpf_jit.c | 3 +- arch/powerpc/net/bpf_jit_comp.c | 3 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 3 +- arch/x86/net/bpf_jit_comp.c | 18 ++++------ include/linux/filter.h | 49 ++++++++++++++++++++++--- kernel/bpf/core.c | 80 +++++++++++++++++++++++++++++++++++++++-- kernel/seccomp.c | 7 ++-- lib/test_bpf.c | 2 +- net/core/filter.c | 6 ++-- 11 files changed, 144 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index a37b989a2f91..a76623bcf722 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -930,5 +930,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 05a56619ece2..cfa83cf2447d 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1427,5 +1427,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 3afa6f4c1957..40c53ff59124 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -697,5 +697,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 61e45b7c04d7..f2833c5b218a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -887,5 +887,5 @@ void bpf_jit_free(struct bpf_prog *fp) module_free(NULL, header); free_filter: - kfree(fp); + bpf_prog_unlock_free(fp); } diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index 1f76c22a6a75..f7a736b645e8 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -812,5 +812,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b08a98c59530..39ccfbb4a723 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -972,23 +972,17 @@ out: kfree(addrs); } -static void bpf_jit_free_deferred(struct work_struct *work) +void bpf_jit_free(struct bpf_prog *fp) { - struct bpf_prog *fp = container_of(work, struct bpf_prog, work); unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; struct bpf_binary_header *header = (void *)addr; + if (!fp->jited) + goto free_filter; + set_memory_rw(addr, header->pages); module_free(NULL, header); - kfree(fp); -} -void bpf_jit_free(struct bpf_prog *fp) -{ - if (fp->jited) { - INIT_WORK(&fp->work, bpf_jit_free_deferred); - schedule_work(&fp->work); - } else { - kfree(fp); - } +free_filter: + bpf_prog_unlock_free(fp); } diff --git a/include/linux/filter.h b/include/linux/filter.h index a5227ab8ccb1..c78994593355 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -9,6 +9,11 @@ #include #include #include +#include + +struct sk_buff; +struct sock; +struct seccomp_data; /* Internally used and optimized filter representation with extended * instruction set based on top of classic BPF. @@ -320,20 +325,23 @@ struct sock_fprog_kern { struct sock_filter *filter; }; -struct sk_buff; -struct sock; -struct seccomp_data; +struct bpf_work_struct { + struct bpf_prog *prog; + struct work_struct work; +}; struct bpf_prog { + u32 pages; /* Number of allocated pages */ u32 jited:1, /* Is our filter JIT'ed? */ len:31; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ + struct bpf_work_struct *work; /* Deferred free work struct */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); + /* Instructions for interpreter */ union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; - struct work_struct work; }; }; @@ -353,6 +361,26 @@ static inline unsigned int bpf_prog_size(unsigned int proglen) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ + set_memory_ro((unsigned long)fp, fp->pages); +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ + set_memory_rw((unsigned long)fp, fp->pages); +} +#else +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ +} +#endif /* CONFIG_DEBUG_SET_MODULE_RONX */ + int sk_filter(struct sock *sk, struct sk_buff *skb); void bpf_prog_select_runtime(struct bpf_prog *fp); @@ -361,6 +389,17 @@ void bpf_prog_free(struct bpf_prog *fp); int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_insn *new_prog, int *new_len); +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags); +void __bpf_prog_free(struct bpf_prog *fp); + +static inline void bpf_prog_unlock_free(struct bpf_prog *fp) +{ + bpf_prog_unlock_ro(fp); + __bpf_prog_free(fp); +} + int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); void bpf_prog_destroy(struct bpf_prog *fp); @@ -450,7 +489,7 @@ static inline void bpf_jit_compile(struct bpf_prog *fp) static inline void bpf_jit_free(struct bpf_prog *fp) { - kfree(fp); + bpf_prog_unlock_free(fp); } #endif /* CONFIG_BPF_JIT */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7f0dbcbb34af..b54bb2c2e494 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -22,6 +22,7 @@ */ #include #include +#include #include /* Registers */ @@ -63,6 +64,67 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_work_struct *ws; + struct bpf_prog *fp; + + size = round_up(size, PAGE_SIZE); + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp == NULL) + return NULL; + + ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags); + if (ws == NULL) { + vfree(fp); + return NULL; + } + + fp->pages = size / PAGE_SIZE; + fp->work = ws; + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_alloc); + +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + BUG_ON(fp_old == NULL); + + size = round_up(size, PAGE_SIZE); + if (size <= fp_old->pages * PAGE_SIZE) + return fp_old; + + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); + fp->pages = size / PAGE_SIZE; + + /* We keep fp->work from fp_old around in the new + * reallocated structure. + */ + fp_old->work = NULL; + __bpf_prog_free(fp_old); + } + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_realloc); + +void __bpf_prog_free(struct bpf_prog *fp) +{ + kfree(fp->work); + vfree(fp); +} +EXPORT_SYMBOL_GPL(__bpf_prog_free); + /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. @@ -523,12 +585,26 @@ void bpf_prog_select_runtime(struct bpf_prog *fp) /* Probe if internal BPF can be JITed */ bpf_int_jit_compile(fp); + /* Lock whole bpf_prog as read-only */ + bpf_prog_lock_ro(fp); } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); -/* free internal BPF program */ +static void bpf_prog_free_deferred(struct work_struct *work) +{ + struct bpf_work_struct *ws; + + ws = container_of(work, struct bpf_work_struct, work); + bpf_jit_free(ws->prog); +} + +/* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - bpf_jit_free(fp); + struct bpf_work_struct *ws = fp->work; + + INIT_WORK(&ws->work, bpf_prog_free_deferred); + ws->prog = fp; + schedule_work(&ws->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 44eb005c6695..84922befea84 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -395,16 +395,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) if (!filter) goto free_prog; - filter->prog = kzalloc(bpf_prog_size(new_len), - GFP_KERNEL|__GFP_NOWARN); + filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); if (!filter->prog) goto free_filter; ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); if (ret) goto free_filter_prog; - kfree(fp); + kfree(fp); atomic_set(&filter->usage, 1); filter->prog->len = new_len; @@ -413,7 +412,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return filter; free_filter_prog: - kfree(filter->prog); + __bpf_prog_free(filter->prog); free_filter: kfree(filter); free_prog: diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 8c66c6aace04..9a67456ba29a 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1836,7 +1836,7 @@ static struct bpf_prog *generate_filter(int which, int *err) break; case INTERNAL: - fp = kzalloc(bpf_prog_size(flen), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(flen), 0); if (fp == NULL) { pr_cont("UNEXPECTED_FAIL no memory left\n"); *err = -ENOMEM; diff --git a/net/core/filter.c b/net/core/filter.c index d814b8a89d0f..37f8eb06fdee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -933,7 +933,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL); + fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. @@ -1013,7 +1013,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; @@ -1069,7 +1069,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - prog = kmalloc(bpf_fsize, GFP_KERNEL); + prog = bpf_prog_alloc(bpf_fsize, 0); if (!prog) return -ENOMEM; -- cgit v1.2.3 From e020836d953eb1ce5b9221b32f4613646a4d5772 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Sep 2014 23:30:05 +0200 Subject: dev_ioctl: remove dev_load() CAP_SYS_MODULE message Marcel reported to see the following message when autoloading is being triggered when adding nlmon device: Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-nlmon instead. This false-positive happens despite with having correct capabilities set, e.g. through issuing `ip link del dev nlmon` more than once on a valid device with name nlmon, but Marcel has also seen it on creation time when no nlmon module is previously compiled-in or loaded as module and the device name equals a link type name (e.g. nlmon, vxlan, team). Stephen says: The netdev module alias is a hold over from the past. For normal devices, people used to create a alias eth0 to and point it to the type of network device used, that was back in the bad old ISA days before real discovery. Also, the tunnels create module alias for the control device and ip used to use this to autoload the tunnel device. The message is bogus and should just be removed, I also see it in a couple of other cases where tap devices are renamed for other usese. As mentioned in 8909c9ad8ff0 ("net: don't allow CAP_NET_ADMIN to load non-netdev kernel modules"), we nevertheless still might want to leave the old autoloading behaviour in place as it could break old scripts, so for now, lets just remove the log message as Stephen suggests. Reference: http://thread.gmane.org/gmane.linux.kernel/1105168 Reported-by: Marcel Holtmann Suggested-by: Stephen Hemminger Signed-off-by: Daniel Borkmann Cc: Vasiliy Kulikov Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index cf999e09bcd2..72e899a3efda 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -365,11 +365,8 @@ void dev_load(struct net *net, const char *name) no_module = !dev; if (no_module && capable(CAP_NET_ADMIN)) no_module = request_module("netdev-%s", name); - if (no_module && capable(CAP_SYS_MODULE)) { - if (!request_module("%s", name)) - pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n", - name); - } + if (no_module && capable(CAP_SYS_MODULE)) + request_module("%s", name); } EXPORT_SYMBOL(dev_load); -- cgit v1.2.3 From f0db9b073415848709dd59a6394969882f517da9 Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan <_govind@gmx.com> Date: Wed, 3 Sep 2014 03:17:20 +0530 Subject: ethtool: Add generic options for tunables This patch adds new ethtool cmd, ETHTOOL_GTUNABLE & ETHTOOL_STUNABLE for getting tunable values from driver. Add get_tunable and set_tunable to ethtool_ops. Driver implements these functions for getting/setting tunable value. Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com> Signed-off-by: David S. Miller --- include/linux/ethtool.h | 4 +++ include/uapi/linux/ethtool.h | 28 +++++++++++++++ net/core/ethtool.c | 81 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+) (limited to 'net') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e658229fee39..c1a2d60dfb82 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -257,6 +257,10 @@ struct ethtool_ops { struct ethtool_eeprom *, u8 *); int (*get_eee)(struct net_device *, struct ethtool_eee *); int (*set_eee)(struct net_device *, struct ethtool_eee *); + int (*get_tunable)(struct net_device *, + const struct ethtool_tunable *, void *); + int (*set_tunable)(struct net_device *, + const struct ethtool_tunable *, const void *); }; diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index e3c7a719c76b..7a364f2f3d3f 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -209,6 +209,32 @@ struct ethtool_value { __u32 data; }; +enum tunable_id { + ETHTOOL_ID_UNSPEC, + ETHTOOL_RX_COPYBREAK, +}; + +enum tunable_type_id { + ETHTOOL_TUNABLE_UNSPEC, + ETHTOOL_TUNABLE_U8, + ETHTOOL_TUNABLE_U16, + ETHTOOL_TUNABLE_U32, + ETHTOOL_TUNABLE_U64, + ETHTOOL_TUNABLE_STRING, + ETHTOOL_TUNABLE_S8, + ETHTOOL_TUNABLE_S16, + ETHTOOL_TUNABLE_S32, + ETHTOOL_TUNABLE_S64, +}; + +struct ethtool_tunable { + __u32 cmd; + __u32 id; + __u32 type_id; + __u32 len; + void *data[0]; +}; + /** * struct ethtool_regs - hardware register dump * @cmd: Command number = %ETHTOOL_GREGS @@ -1152,6 +1178,8 @@ enum ethtool_sfeatures_retval_bits { #define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ #define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ +#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ +#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 17cb912793fa..27e61b886520 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1621,6 +1621,80 @@ static int ethtool_get_module_eeprom(struct net_device *dev, modinfo.eeprom_len); } +static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) +{ + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + if (tuna->len != sizeof(u32) || + tuna->type_id != ETHTOOL_TUNABLE_U32) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->get_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + ret = ops->get_tunable(dev, &tuna, data); + if (ret) + goto out; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_to_user(useraddr, data, tuna.len)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->set_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_from_user(data, useraddr, tuna.len)) + goto out; + ret = ops->set_tunable(dev, &tuna, data); + +out: + kfree(data); + return ret; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -1670,6 +1744,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: + case ETHTOOL_GTUNABLE: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -1857,6 +1932,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GMODULEEEPROM: rc = ethtool_get_module_eeprom(dev, useraddr); break; + case ETHTOOL_GTUNABLE: + rc = ethtool_get_tunable(dev, useraddr); + break; + case ETHTOOL_STUNABLE: + rc = ethtool_set_tunable(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3 From 29abe2fda54f8e16ecff5d76d96325f31508d5ba Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Wed, 3 Sep 2014 13:16:54 -0700 Subject: l2tp: fix missing line continuation This syntax error was covered by L2TP_REFCNT_DEBUG not being set by default. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 797c0af71c06..2aa2b6c15f20 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -148,7 +148,7 @@ do { \ atomic_read(&_t->ref_count)); \ l2tp_tunnel_inc_refcount_1(_t); \ } while (0) -#define l2tp_tunnel_dec_refcount(_t) +#define l2tp_tunnel_dec_refcount(_t) \ do { \ pr_debug("l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", \ __func__, __LINE__, (_t)->name, \ -- cgit v1.2.3 From caa415270c732505240bb60171c44a7838c555e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Sep 2014 22:21:56 -0700 Subject: ipv4: fix a race in update_or_create_fnhe() nh_exceptions is effectively used under rcu, but lacks proper barriers. Between kzalloc() and setting of nh->nh_exceptions(), we need a proper memory barrier. Signed-off-by: Eric Dumazet Fixes: 4895c771c7f00 ("ipv4: Add FIB nexthop exceptions.") Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- net/ipv4/fib_semantics.c | 8 +++++--- net/ipv4/route.c | 6 +++--- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 9922093f575e..f30fd554127e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -87,7 +87,7 @@ struct fib_nh { int nh_saddr_genid; struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; - struct fnhe_hash_bucket *nh_exceptions; + struct fnhe_hash_bucket __rcu *nh_exceptions; }; /* diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b10cd43a4722..5b6efb3d2308 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -157,9 +157,12 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp) static void free_nh_exceptions(struct fib_nh *nh) { - struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fnhe_hash_bucket *hash; int i; + hash = rcu_dereference_protected(nh->nh_exceptions, 1); + if (!hash) + return; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; @@ -205,8 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); - if (nexthop_nh->nh_exceptions) - free_nh_exceptions(nexthop_nh); + free_nh_exceptions(nexthop_nh); rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index eaa4b000c7b4..44b0cbdd76f1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -628,12 +628,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, spin_lock_bh(&fnhe_lock); - hash = nh->nh_exceptions; + hash = rcu_dereference(nh->nh_exceptions); if (!hash) { hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; - nh->nh_exceptions = hash; + rcu_assign_pointer(nh->nh_exceptions, hash); } hash += hval; @@ -1242,7 +1242,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) { - struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions); struct fib_nh_exception *fnhe; u32 hval; -- cgit v1.2.3 From d546c621542df9e45eedc91f35356e887ac63b7b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Sep 2014 08:21:31 -0700 Subject: ipv4: harden fnhe_hashfun() Lets make this hash function a bit secure, as ICMP attacks are still in the wild. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 ++- net/ipv4/route.c | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f30fd554127e..dc9d2a27c315 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -65,7 +65,8 @@ struct fnhe_hash_bucket { struct fib_nh_exception __rcu *chain; }; -#define FNHE_HASH_SIZE 2048 +#define FNHE_HASH_SHIFT 11 +#define FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT) #define FNHE_RECLAIM_DEPTH 5 struct fib_nh { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 44b0cbdd76f1..234a43e233dc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -596,12 +596,12 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) static inline u32 fnhe_hashfun(__be32 daddr) { + static u32 fnhe_hashrnd __read_mostly; u32 hval; - hval = (__force u32) daddr; - hval ^= (hval >> 11) ^ (hval >> 22); - - return hval & (FNHE_HASH_SIZE - 1); + net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd)); + hval = jhash_1word((__force u32) daddr, fnhe_hashrnd); + return hash_32(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) -- cgit v1.2.3 From 37846ef0188335e49f2491a5bbf4e0dc7d407ea0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 4 Sep 2014 13:31:10 -0400 Subject: net-timestamp: Merge shared code between phy and regular timestamping This change merges the shared bits that exist between skb_tx_tstamp and skb_complete_tx_timestamp. By doing this we can avoid the two diverging as there were already changes pushed into skb_tx_tstamp that hadn't made it into the other function. In addition this resolves issues with the fact that skb_complete_tx_timestamp was included in linux/skbuff.h even though it was only compiled in if phy timestamping was enabled. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 65 ++++++++++++++++++++++++++++++++----------------- net/core/timestamping.c | 29 ---------------------- 2 files changed, 42 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 53ce536e3d6e..697e696c914b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3511,33 +3511,13 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) } EXPORT_SYMBOL(sock_dequeue_err_skb); -void __skb_tstamp_tx(struct sk_buff *orig_skb, - struct skb_shared_hwtstamps *hwtstamps, - struct sock *sk, int tstype) +static void __skb_complete_tx_timestamp(struct sk_buff *skb, + struct sock *sk, + int tstype) { struct sock_exterr_skb *serr; - struct sk_buff *skb; int err; - if (!sk) - return; - - if (hwtstamps) { - *skb_hwtstamps(orig_skb) = - *hwtstamps; - } else { - /* - * no hardware time stamps available, - * so keep the shared tx_flags and only - * store software time stamp - */ - orig_skb->tstamp = ktime_get_real(); - } - - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) - return; - serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; @@ -3554,6 +3534,45 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (err) kfree_skb(skb); } + +void skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = skb->sk; + + skb->sk = NULL; + + if (hwtstamps) { + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); + } else { + kfree_skb(skb); + } + + sock_put(sk); +} +EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); + +void __skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype) +{ + struct sk_buff *skb; + + if (!sk) + return; + + if (hwtstamps) + *skb_hwtstamps(orig_skb) = *hwtstamps; + else + orig_skb->tstamp = ktime_get_real(); + + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + __skb_complete_tx_timestamp(skb, sk, tstype); +} EXPORT_SYMBOL_GPL(__skb_tstamp_tx); void skb_tstamp_tx(struct sk_buff *orig_skb, diff --git a/net/core/timestamping.c b/net/core/timestamping.c index a8770391ea5b..f48a59fd8f39 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -63,35 +63,6 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) } EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); -void skb_complete_tx_timestamp(struct sk_buff *skb, - struct skb_shared_hwtstamps *hwtstamps) -{ - struct sock *sk = skb->sk; - struct sock_exterr_skb *serr; - int err; - - if (!hwtstamps) { - sock_put(sk); - kfree_skb(skb); - return; - } - - *skb_hwtstamps(skb) = *hwtstamps; - - serr = SKB_EXT_ERR(skb); - memset(serr, 0, sizeof(*serr)); - serr->ee.ee_errno = ENOMSG; - serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; - skb->sk = NULL; - - err = sock_queue_err_skb(sk, skb); - - sock_put(sk); - if (err) - kfree_skb(skb); -} -EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); - bool skb_defer_rx_timestamp(struct sk_buff *skb) { struct phy_device *phydev; -- cgit v1.2.3 From 62bccb8cdb69051b95a55ab0c489e3cab261c8ef Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 4 Sep 2014 13:31:35 -0400 Subject: net-timestamp: Make the clone operation stand-alone from phy timestamping The phy timestamping takes a different path than the regular timestamping does in that it will create a clone first so that the packets needing to be timestamped can be placed in a queue, or the context block could be used. In order to support these use cases I am pulling the core of the code out so it can be used in other drivers beyond just phy devices. In addition I have added a destructor named sock_efree which is meant to provide a simple way for dropping the reference to skb exceptions that aren't part of either the receive or send windows for the socket, and I have removed some duplication in spots where this destructor could be used in place of sock_edemux. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- drivers/net/phy/dp83640.c | 6 +++--- include/linux/skbuff.h | 2 ++ include/net/sock.h | 1 + net/core/skbuff.c | 32 +++++++++++++++++++++++++------- net/core/sock.c | 6 ++++++ net/core/timestamping.c | 14 +++----------- 6 files changed, 40 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index d5991ac46ab0..87648b306551 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -1148,7 +1148,7 @@ static void dp83640_remove(struct phy_device *phydev) kfree_skb(skb); while ((skb = skb_dequeue(&dp83640->tx_queue)) != NULL) - skb_complete_tx_timestamp(skb, NULL); + kfree_skb(skb); clock = dp83640_clock_get(dp83640->clock); @@ -1405,7 +1405,7 @@ static void dp83640_txtstamp(struct phy_device *phydev, case HWTSTAMP_TX_ONESTEP_SYNC: if (is_sync(skb, type)) { - skb_complete_tx_timestamp(skb, NULL); + kfree_skb(skb); return; } /* fall through */ @@ -1416,7 +1416,7 @@ static void dp83640_txtstamp(struct phy_device *phydev, case HWTSTAMP_TX_OFF: default: - skb_complete_tx_timestamp(skb, NULL); + kfree_skb(skb); break; } } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 02529fcad1ac..1cf0cfaef10a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2690,6 +2690,8 @@ static inline ktime_t net_invalid_timestamp(void) return ktime_set(0, 0); } +struct sk_buff *skb_clone_sk(struct sk_buff *skb); + #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING void skb_clone_tx_timestamp(struct sk_buff *skb); diff --git a/include/net/sock.h b/include/net/sock.h index 3fde6130789d..e02be37a3d91 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1574,6 +1574,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, void sock_wfree(struct sk_buff *skb); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); +void sock_efree(struct sk_buff *skb); void sock_edemux(struct sk_buff *skb); int sock_setsockopt(struct socket *sock, int level, int op, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 697e696c914b..a936a401564e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3511,6 +3511,27 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) } EXPORT_SYMBOL(sock_dequeue_err_skb); +struct sk_buff *skb_clone_sk(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sk_buff *clone; + + if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) + return NULL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return NULL; + } + + clone->sk = sk; + clone->destructor = sock_efree; + + return clone; +} +EXPORT_SYMBOL(skb_clone_sk); + static void __skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk, int tstype) @@ -3540,14 +3561,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, { struct sock *sk = skb->sk; - skb->sk = NULL; + /* take a reference to prevent skb_orphan() from freeing the socket */ + sock_hold(sk); - if (hwtstamps) { - *skb_hwtstamps(skb) = *hwtstamps; - __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); - } else { - kfree_skb(skb); - } + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); sock_put(sk); } diff --git a/net/core/sock.c b/net/core/sock.c index f1a638ee93d9..d04005c51724 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1637,6 +1637,12 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +void sock_efree(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_efree); + void sock_edemux(struct sk_buff *skb) { struct sock *sk = skb->sk; diff --git a/net/core/timestamping.c b/net/core/timestamping.c index f48a59fd8f39..43d3dd62fcc8 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -36,10 +36,9 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) { struct phy_device *phydev; struct sk_buff *clone; - struct sock *sk = skb->sk; unsigned int type; - if (!sk) + if (!skb->sk) return; type = classify(skb); @@ -48,16 +47,9 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) phydev = skb->dev->phydev; if (likely(phydev->drv->txtstamp)) { - if (!atomic_inc_not_zero(&sk->sk_refcnt)) + clone = skb_clone_sk(skb); + if (!clone) return; - - clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) { - sock_put(sk); - return; - } - - clone->sk = sk; phydev->drv->txtstamp(phydev, clone, type); } } -- cgit v1.2.3 From 82eabd9eb2ec1603282a2c3f74dfcb6fe0aaea0e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 4 Sep 2014 13:32:11 -0400 Subject: net: merge cases where sock_efree and sock_edemux are the same function Since sock_efree and sock_demux are essentially the same code for non-TCP sockets and the case where CONFIG_INET is not defined we can combine the code or replace the call to sock_edemux in several spots. As a result we can avoid a bit of unnecessary code or code duplication. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++++ net/core/sock.c | 4 ++-- net/ipv4/udp.c | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index e02be37a3d91..ad23e80cb8d3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1575,7 +1575,11 @@ void sock_wfree(struct sk_buff *skb); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); +#ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); +#else +#define sock_edemux(skb) sock_efree(skb) +#endif int sock_setsockopt(struct socket *sock, int level, int op, char __user *optval, unsigned int optlen); diff --git a/net/core/sock.c b/net/core/sock.c index d04005c51724..69592cb66e3b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1643,18 +1643,18 @@ void sock_efree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_efree); +#ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb) { struct sock *sk = skb->sk; -#ifdef CONFIG_INET if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_put(inet_twsk(sk)); else -#endif sock_put(sk); } EXPORT_SYMBOL(sock_edemux); +#endif kuid_t sock_i_uid(struct sock *sk) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0da3849fd35b..cd0db5471bb5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1972,7 +1972,7 @@ void udp_v4_early_demux(struct sk_buff *skb) return; skb->sk = sk; - skb->destructor = sock_edemux; + skb->destructor = sock_efree; dst = sk->sk_rx_dst; if (dst) -- cgit v1.2.3 From 56193d1bce2b2759cb4bdcc00cd05544894a0c90 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 5 Sep 2014 19:20:26 -0400 Subject: net: Add function for parsing the header length out of linear ethernet frames This patch updates some of the flow_dissector api so that it can be used to parse the length of ethernet buffers stored in fragments. Most of the changes needed were to __skb_get_poff as it needed to be updated to support sending a linear buffer instead of a skb. I have split __skb_get_poff into two functions, the first is skb_get_poff and it retains the functionality of the original __skb_get_poff. The other function is __skb_get_poff which now works much like __skb_flow_dissect in relation to skb_flow_dissect in that it provides the same functionality but works with just a data buffer and hlen instead of needing an skb. Signed-off-by: Alexander Duyck Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 1 + include/linux/skbuff.h | 4 +++- include/net/flow_keys.h | 2 ++ net/core/filter.c | 2 +- net/core/flow_dissector.c | 46 +++++++++++++++++++++++++++++++-------------- net/ethernet/eth.c | 27 ++++++++++++++++++++++++++ 6 files changed, 66 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 9c5529dc6d07..733980fce8e3 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -29,6 +29,7 @@ #include #ifdef __KERNEL__ +u32 eth_get_headlen(void *data, unsigned int max_len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1cf0cfaef10a..07c9fdd0c126 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3218,7 +3218,9 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); -u32 __skb_get_poff(const struct sk_buff *skb); +u32 skb_get_poff(const struct sk_buff *skb); +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int hlen); /** * skb_head_is_locked - Determine if the skb->head is locked down diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index 9a03f73c4974..7ee2df083542 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -40,4 +40,6 @@ static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); } u32 flow_hash_from_keys(struct flow_keys *keys); +unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len, + __be16 protocol); #endif diff --git a/net/core/filter.c b/net/core/filter.c index 37f8eb06fdee..fa5b7d0f77ac 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -113,7 +113,7 @@ static unsigned int pkt_type_offset(void) static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { - return __skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff((struct sk_buff *)(unsigned long) ctx); } static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 12f48ca7a0b0..8560dea58803 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -13,6 +13,7 @@ #include #include #include +#include /* copy saddr & daddr, possibly using 64bit load/store * Equivalent to : flow->src = iph->saddr; @@ -117,6 +118,13 @@ ipv6: flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); + /* skip the flow label processing if skb is NULL. The + * assumption here is that if there is no skb we are not + * looking for flow info as much as we are length. + */ + if (!skb) + break; + flow_label = ip6_flowlabel(iph); if (flow_label) { /* Awesome, IPv6 packet has a flow label so we can @@ -165,6 +173,9 @@ ipv6: return false; } } + case htons(ETH_P_FCOE): + flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + /* fall through */ default: return false; } @@ -316,26 +327,18 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(__skb_tx_hash); -/* __skb_get_poff() returns the offset to the payload as far as it could - * be dissected. The main user is currently BPF, so that we can dynamically - * truncate packets without needing to push actual payload to the user - * space and can analyze headers only, instead. - */ -u32 __skb_get_poff(const struct sk_buff *skb) +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int hlen) { - struct flow_keys keys; - u32 poff = 0; + u32 poff = keys->thoff; - if (!skb_flow_dissect(skb, &keys)) - return 0; - - poff += keys.thoff; - switch (keys.ip_proto) { + switch (keys->ip_proto) { case IPPROTO_TCP: { const struct tcphdr *tcph; struct tcphdr _tcph; - tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); + tcph = __skb_header_pointer(skb, poff, sizeof(_tcph), + data, hlen, &_tcph); if (!tcph) return poff; @@ -369,6 +372,21 @@ u32 __skb_get_poff(const struct sk_buff *skb) return poff; } +/* skb_get_poff() returns the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 skb_get_poff(const struct sk_buff *skb) +{ + struct flow_keys keys; + + if (!skb_flow_dissect(skb, &keys)) + return 0; + + return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); +} + static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 5cebca134585..33a140e15834 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -145,6 +145,33 @@ int eth_rebuild_header(struct sk_buff *skb) } EXPORT_SYMBOL(eth_rebuild_header); +/** + * eth_get_headlen - determine the the length of header for an ethernet frame + * @data: pointer to start of frame + * @len: total length of frame + * + * Make a best effort attempt to pull the length for all of the headers for + * a given frame in a linear buffer. + */ +u32 eth_get_headlen(void *data, unsigned int len) +{ + const struct ethhdr *eth = (const struct ethhdr *)data; + struct flow_keys keys; + + /* this should never happen, but better safe than sorry */ + if (len < sizeof(*eth)) + return len; + + /* parse any remaining L2/L3 headers, check for L4 */ + if (!__skb_flow_dissect(NULL, &keys, data, + eth->h_proto, sizeof(*eth), len)) + return max_t(u32, keys.thoff, sizeof(*eth)); + + /* parse for any L4 headers */ + return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); +} +EXPORT_SYMBOL(eth_get_headlen); + /** * eth_type_trans - determine the packet's protocol ID. * @skb: received socket data -- cgit v1.2.3 From 04317dafd11dd7b0ec19b85f098414abae6ed5f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2014 15:33:32 -0700 Subject: tcp: introduce TCP_SKB_CB(skb)->tcp_tw_isn TCP_SKB_CB(skb)->when has different meaning in output and input paths. In output path, it contains a timestamp. In input path, it contains an ISN, chosen by tcp_timewait_state_process() Lets add a different name to ease code comprehension. Note that 'when' field will disappear in following patch, as skb_mstamp already contains timestamp, the anonymous union will promptly disappear as well. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 7 ++++++- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 5 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 590e01a476ac..0cd7d2c65dc0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -698,7 +698,12 @@ struct tcp_skb_cb { } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - __u32 when; /* used to compute rtt's */ + union { + /* used in output path */ + __u32 when; /* used to compute rtt's */ + /* used in input path */ + __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ + }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index aba4926ca095..9c8b9f1dcf69 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5906,7 +5906,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct request_sock *req; struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = NULL; - __u32 isn = TCP_SKB_CB(skb)->when; + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; bool want_cookie = false, fastopen; struct flowi fl; struct tcp_fastopen_cookie foc = { .len = -1 }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 487e2a41667f..02e6cd29ebf1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1627,7 +1627,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1649988bd1b6..a058f411d3a6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -232,7 +232,7 @@ kill: u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; - TCP_SKB_CB(skb)->when = isn; + TCP_SKB_CB(skb)->tcp_tw_isn = isn; return TCP_TW_SYN; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 29964c3d363c..5b3c70ff7a72 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -738,7 +738,7 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = inet6_iif(skb); - if (!TCP_SKB_CB(skb)->when && + if (!TCP_SKB_CB(skb)->tcp_tw_isn && (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { @@ -1412,7 +1412,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; -- cgit v1.2.3 From 7faee5c0d514162853a343d93e4a0b6bb8bfec21 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2014 15:33:33 -0700 Subject: tcp: remove TCP_SKB_CB(skb)->when After commit 740b0f1841f6 ("tcp: switch rtt estimations to usec resolution"), we no longer need to maintain timestamps in two different fields. TCP_SKB_CB(skb)->when can be removed, as same information sits in skb_mstamp.stamp_jiffies Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 13 +++++++------ net/ipv4/tcp_input.c | 3 ++- net/ipv4/tcp_ipv4.c | 5 +++-- net/ipv4/tcp_output.c | 39 ++++++++++++++++----------------------- net/ipv4/tcp_timer.c | 7 +++---- 5 files changed, 31 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0cd7d2c65dc0..a4201ef216e8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -672,6 +672,12 @@ void tcp_send_window_probe(struct sock *sk); */ #define tcp_time_stamp ((__u32)(jiffies)) +static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) +{ + return skb->skb_mstamp.stamp_jiffies; +} + + #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) #define TCPHDR_FIN 0x01 @@ -698,12 +704,7 @@ struct tcp_skb_cb { } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - union { - /* used in output path */ - __u32 when; /* used to compute rtt's */ - /* used in input path */ - __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ - }; + __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c8b9f1dcf69..f97003ad0af5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2967,7 +2967,8 @@ void tcp_rearm_rto(struct sock *sk) if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); - const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; + const u32 rto_time_stamp = + tcp_skb_timestamp(skb) + rto; s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); /* delta may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 02e6cd29ebf1..3f9bc3f0bba0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -437,8 +437,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) skb = tcp_write_queue_head(sk); BUG_ON(!skb); - remaining = icsk->icsk_rto - min(icsk->icsk_rto, - tcp_time_stamp - TCP_SKB_CB(skb)->when); + remaining = icsk->icsk_rto - + min(icsk->icsk_rto, + tcp_time_stamp - tcp_skb_timestamp(skb)); if (remaining) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5a7c41fbc6d3..3b22dcb7bb5c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -550,7 +550,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; + opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -618,7 +618,7 @@ static unsigned int tcp_synack_options(struct sock *sk, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsval = tcp_skb_timestamp(skb); opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -647,7 +647,6 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { - struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); unsigned int size = 0; unsigned int eff_sacks; @@ -666,7 +665,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; + opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -886,8 +885,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; - /* Our usage of tstamp should remain private */ - skb->tstamp.tv64 = 0; } inet = inet_sk(sk); @@ -975,7 +972,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + /* Our usage of tstamp should remain private */ + skb->tstamp.tv64 = 0; err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + if (likely(err <= 0)) return err; @@ -1149,7 +1149,6 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Looks stupid, but our code really uses when of * skbs, which it never sent before. --ANK */ - TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; buff->tstamp = skb->tstamp; tcp_fragment_tstamp(skb, buff); @@ -1874,8 +1873,8 @@ static int tcp_mtu_probe(struct sock *sk) tcp_init_tso_segs(sk, nskb, nskb->len); /* We're ready to send. If this fails, the probe will - * be resegmented into mss-sized pieces by tcp_write_xmit(). */ - TCP_SKB_CB(nskb)->when = tcp_time_stamp; + * be resegmented into mss-sized pieces by tcp_write_xmit(). + */ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. */ @@ -1935,8 +1934,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, BUG_ON(!tso_segs); if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { - /* "when" is used as a start point for the retransmit timer */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; + /* "skb_mstamp" is used as a start point for the retransmit timer */ + skb_mstamp_get(&skb->skb_mstamp); goto repair; /* Skip network transmission */ } @@ -2000,8 +1999,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; - TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2499,7 +2496,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom @@ -2544,7 +2540,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Save stamp of the first retransmit. */ if (!tp->retrans_stamp) - tp->retrans_stamp = TCP_SKB_CB(skb)->when; + tp->retrans_stamp = tcp_skb_timestamp(skb); /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). @@ -2752,7 +2748,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); /* Send it off. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -2791,7 +2786,6 @@ int tcp_send_synack(struct sock *sk) TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; TCP_ECN_send_synack(tcp_sk(sk), skb); } - TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -2835,10 +2829,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); + skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req); else #endif - TCP_SKB_CB(skb)->when = tcp_time_stamp; + skb_mstamp_get(&skb->skb_mstamp); tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, foc) + sizeof(*th); @@ -3086,7 +3080,7 @@ int tcp_connect(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); - tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->retrans_stamp = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); TCP_ECN_send_syn(sk, buff); @@ -3194,7 +3188,7 @@ void tcp_send_ack(struct sock *sk) tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* Send it off, this clears delayed acks for us. */ - TCP_SKB_CB(buff)->when = tcp_time_stamp; + skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } @@ -3226,7 +3220,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) * send it. */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); - TCP_SKB_CB(skb)->when = tcp_time_stamp; + skb_mstamp_get(&skb->skb_mstamp); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -3270,7 +3264,6 @@ int tcp_write_wakeup(struct sock *sk) tcp_set_skb_tso_segs(sk, skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) tcp_event_new_data_sent(sk, skb); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index df90cd1ce37f..a339e7ba05a4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -135,10 +135,9 @@ static bool retransmits_timed_out(struct sock *sk, if (!inet_csk(sk)->icsk_retransmits) return false; - if (unlikely(!tcp_sk(sk)->retrans_stamp)) - start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; - else - start_ts = tcp_sk(sk)->retrans_stamp; + start_ts = tcp_sk(sk)->retrans_stamp; + if (unlikely(!start_ts)) + start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); -- cgit v1.2.3 From 87d943085b76c6f07807dbc9fde2aad88e828590 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 6 Sep 2014 11:18:20 -0400 Subject: tcp: remove obsolete comment about TCP_SKB_CB(skb)->when in tcp_fragment() The TCP_SKB_CB(skb)->when field no longer exists as of recent change 7faee5c0d514 ("tcp: remove TCP_SKB_CB(skb)->when"). And in any case, tcp_fragment() is called on already-transmitted packets from the __tcp_retransmit_skb() call site, so copying timestamps of any kind in this spot is quite sensible. Signed-off-by: Neal Cardwell Reported-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3b22dcb7bb5c..7f1280dcad57 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1146,9 +1146,6 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, buff->ip_summed = skb->ip_summed; - /* Looks stupid, but our code really uses when of - * skbs, which it never sent before. --ANK - */ buff->tstamp = skb->tstamp; tcp_fragment_tstamp(skb, buff); -- cgit v1.2.3 From 13aa3463e574d2ae2618306dcd3e973c5eed6d7f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Sep 2014 18:32:18 +0300 Subject: rose: use %*ph specifier Instead of dereference each byte let's use %*ph specifier in the printk() calls. Signed-off-by: Andy Shevchenko Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/rose/rose_link.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index bc5514211b0c..e873d7d9f857 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -160,7 +160,8 @@ void rose_link_rx_restart(struct sk_buff *skb, struct rose_neigh *neigh, unsigne break; case ROSE_DIAGNOSTIC: - printk(KERN_WARNING "ROSE: received diagnostic #%d - %02X %02X %02X\n", skb->data[3], skb->data[4], skb->data[5], skb->data[6]); + pr_warn("ROSE: received diagnostic #%d - %3ph\n", skb->data[3], + skb->data + 4); break; default: -- cgit v1.2.3 From 24a4e4008ca2a819c4c889163586a8a9b7a3a08d Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Wed, 3 Sep 2014 06:22:10 -0700 Subject: mac80211: split 802.11h parsing from transmit power policy Decouple the logic of parsing the 802.11d and 802.11h IEs from the part of deciding what to do about the data (messaging, clamping to 0 dBm, doing the actual setting). This paves the way for the next patch, which introduces more data sources for transmit power limitation. Signed-off-by: Steinar H. Gunderson Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 60 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f09bef7ac21d..0b812a88f95f 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1171,19 +1171,21 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, TU_TO_EXP_TIME(csa_ie.count * cbss->beacon_interval)); } -static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, - struct ieee80211_channel *channel, - const u8 *country_ie, u8 country_ie_len, - const u8 *pwr_constr_elem) +static bool +ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *channel, + const u8 *country_ie, u8 country_ie_len, + const u8 *pwr_constr_elem, + int *chan_pwr, int *pwr_reduction) { struct ieee80211_country_ie_triplet *triplet; int chan = ieee80211_frequency_to_channel(channel->center_freq); - int i, chan_pwr, chan_increment, new_ap_level; + int i, chan_increment; bool have_chan_pwr = false; /* Invalid IE */ if (country_ie_len % 2 || country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN) - return 0; + return false; triplet = (void *)(country_ie + 3); country_ie_len -= 3; @@ -1211,7 +1213,7 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, for (i = 0; i < triplet->chans.num_channels; i++) { if (first_channel + i * chan_increment == chan) { have_chan_pwr = true; - chan_pwr = triplet->chans.max_power; + *chan_pwr = triplet->chans.max_power; break; } } @@ -1223,18 +1225,41 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, country_ie_len -= 3; } - if (!have_chan_pwr) - return 0; + if (have_chan_pwr) + *pwr_reduction = *pwr_constr_elem; + return have_chan_pwr; +} + +static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *channel, + struct ieee80211_mgmt *mgmt, + const u8 *country_ie, u8 country_ie_len, + const u8 *pwr_constr_ie) +{ + bool has_80211h_pwr = false; + int chan_pwr, pwr_reduction_80211h; + int new_ap_level; - new_ap_level = max_t(int, 0, chan_pwr - *pwr_constr_elem); + if (country_ie && pwr_constr_ie && + mgmt->u.probe_resp.capab_info & + cpu_to_le16(WLAN_CAPABILITY_SPECTRUM_MGMT)) { + has_80211h_pwr = ieee80211_find_80211h_pwr_constr( + sdata, channel, country_ie, country_ie_len, + pwr_constr_ie, &chan_pwr, &pwr_reduction_80211h); + new_ap_level = max_t(int, 0, chan_pwr - pwr_reduction_80211h); + } - if (sdata->ap_power_level == new_ap_level) + if (!has_80211h_pwr) return 0; sdata_info(sdata, "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", - new_ap_level, chan_pwr, *pwr_constr_elem, + new_ap_level, chan_pwr, pwr_reduction_80211h, sdata->u.mgd.bssid); + + if (sdata->ap_power_level == new_ap_level) + return 0; + sdata->ap_power_level = new_ap_level; if (__ieee80211_recalc_txpower(sdata)) return BSS_CHANGED_TXPOWER; @@ -3204,13 +3229,10 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, rx_status->band, true); mutex_unlock(&local->sta_mtx); - if (elems.country_elem && elems.pwr_constr_elem && - mgmt->u.probe_resp.capab_info & - cpu_to_le16(WLAN_CAPABILITY_SPECTRUM_MGMT)) - changed |= ieee80211_handle_pwr_constr(sdata, chan, - elems.country_elem, - elems.country_elem_len, - elems.pwr_constr_elem); + changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt, + elems.country_elem, + elems.country_elem_len, + elems.pwr_constr_elem); ieee80211_bss_info_change_notify(sdata, changed); } -- cgit v1.2.3 From c8d6591752e96c550cb98b781326d72d8eedcc79 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Wed, 3 Sep 2014 06:48:37 -0700 Subject: mac80211: support DTPC IE (from Cisco Client eXtensions) Linux already supports 802.11h, where the access point can tell the client to reduce its transmission power. However, 802.11h is only defined for 5 GHz, where the need for this is much smaller than on 2.4 GHz. Cisco has their own solution, called DTPC (Dynamic Transmit Power Control). Cisco APs on a controller sometimes but not always send 802.11h; they always send DTPC, even on 2.4 GHz. This patch adds support for parsing and honoring the DTPC IE in addition to the 802.11h element (they do not always contain the same limits, so both must be honored); the format is not documented, but very simple. Tested (on top of wireless.git and on 3.16.1) against a Cisco Aironet 1142 joined to a Cisco 2504 WLC, by setting various transmit power levels for the given access points and observing the results. The Wireshark 802.11 dissector agrees with the interpretation of the element, except for negative numbers, which seem to never happen anyway. Signed-off-by: Steinar H. Gunderson Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 ++- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 60 +++++++++++++++++++++++++++++++++++++--------- net/mac80211/util.c | 25 +++++++++++++++++++ 4 files changed, 77 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 77d4f26e0235..61a09f0644aa 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1824,7 +1824,8 @@ enum ieee80211_eid { WLAN_EID_DMG_TSPEC = 146, WLAN_EID_DMG_AT = 147, WLAN_EID_DMG_CAP = 148, - /* 149-150 reserved for Cisco */ + /* 149 reserved for Cisco */ + WLAN_EID_CISCO_VENDOR_SPECIFIC = 150, WLAN_EID_DMG_OPERATION = 151, WLAN_EID_DMG_BSS_PARAM_CHANGE = 152, WLAN_EID_DMG_BEAM_REFINEMENT = 153, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6b7bdd8fae29..c2aaec4dfcf0 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1369,6 +1369,7 @@ struct ieee802_11_elems { const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; const u8 *country_elem; const u8 *pwr_constr_elem; + const u8 *cisco_dtpc_elem; const struct ieee80211_timeout_interval_ie *timeout_int; const u8 *opmode_notif; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0b812a88f95f..a7b92f5f7161 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1230,14 +1230,30 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, return have_chan_pwr; } +static void ieee80211_find_cisco_dtpc(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *channel, + const u8 *cisco_dtpc_ie, + int *pwr_level) +{ + /* From practical testing, the first data byte of the DTPC element + * seems to contain the requested dBm level, and the CLI on Cisco + * APs clearly state the range is -127 to 127 dBm, which indicates + * a signed byte, although it seemingly never actually goes negative. + * The other byte seems to always be zero. + */ + *pwr_level = (__s8)cisco_dtpc_ie[4]; +} + static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, struct ieee80211_mgmt *mgmt, const u8 *country_ie, u8 country_ie_len, - const u8 *pwr_constr_ie) + const u8 *pwr_constr_ie, + const u8 *cisco_dtpc_ie) { - bool has_80211h_pwr = false; - int chan_pwr, pwr_reduction_80211h; + bool has_80211h_pwr = false, has_cisco_pwr = false; + int chan_pwr = 0, pwr_reduction_80211h = 0; + int pwr_level_cisco, pwr_level_80211h; int new_ap_level; if (country_ie && pwr_constr_ie && @@ -1246,16 +1262,35 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, has_80211h_pwr = ieee80211_find_80211h_pwr_constr( sdata, channel, country_ie, country_ie_len, pwr_constr_ie, &chan_pwr, &pwr_reduction_80211h); - new_ap_level = max_t(int, 0, chan_pwr - pwr_reduction_80211h); + pwr_level_80211h = + max_t(int, 0, chan_pwr - pwr_reduction_80211h); + } + + if (cisco_dtpc_ie) { + ieee80211_find_cisco_dtpc( + sdata, channel, cisco_dtpc_ie, &pwr_level_cisco); + has_cisco_pwr = true; } - if (!has_80211h_pwr) + if (!has_80211h_pwr && !has_cisco_pwr) return 0; - sdata_info(sdata, - "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", - new_ap_level, chan_pwr, pwr_reduction_80211h, - sdata->u.mgd.bssid); + /* If we have both 802.11h and Cisco DTPC, apply both limits + * by picking the smallest of the two power levels advertised. + */ + if (has_80211h_pwr && + (!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) { + sdata_info(sdata, + "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", + pwr_level_80211h, chan_pwr, pwr_reduction_80211h, + sdata->u.mgd.bssid); + new_ap_level = pwr_level_80211h; + } else { /* has_cisco_pwr is always true here. */ + sdata_info(sdata, + "Limiting TX power to %d dBm as advertised by %pM\n", + pwr_level_cisco, sdata->u.mgd.bssid); + new_ap_level = pwr_level_cisco; + } if (sdata->ap_power_level == new_ap_level) return 0; @@ -2923,7 +2958,9 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, /* * This is the canonical list of information elements we care about, * the filter code also gives us all changes to the Microsoft OUI - * (00:50:F2) vendor IE which is used for WMM which we need to track. + * (00:50:F2) vendor IE which is used for WMM which we need to track, + * as well as the DTPC IE (part of the Cisco OUI) used for signaling + * changes to requested client power. * * We implement beacon filtering in software since that means we can * avoid processing the frame here and in cfg80211, and userspace @@ -3232,7 +3269,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt, elems.country_elem, elems.country_elem_len, - elems.pwr_constr_elem); + elems.pwr_constr_elem, + elems.cisco_dtpc_elem); ieee80211_bss_info_change_notify(sdata, changed); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b444f27a788e..3c61060a4d2b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1015,6 +1015,31 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, } elems->pwr_constr_elem = pos; break; + case WLAN_EID_CISCO_VENDOR_SPECIFIC: + /* Lots of different options exist, but we only care + * about the Dynamic Transmit Power Control element. + * First check for the Cisco OUI, then for the DTPC + * tag (0x00). + */ + if (elen < 4) { + elem_parse_failed = true; + break; + } + + if (pos[0] != 0x00 || pos[1] != 0x40 || + pos[2] != 0x96 || pos[3] != 0x00) + break; + + if (elen != 6) { + elem_parse_failed = true; + break; + } + + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + + elems->cisco_dtpc_elem = pos; + break; case WLAN_EID_TIMEOUT_INTERVAL: if (elen >= sizeof(struct ieee80211_timeout_interval_ie)) elems->timeout_int = (void *)pos; -- cgit v1.2.3 From b1e9be8775b85d761cdb91386200a04d741f6a0d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 8 Sep 2014 11:22:42 +0200 Subject: mac80211: annotate MMIC head/tailroom warning This message occasionally triggers for some people as in https://bugzilla.redhat.com/show_bug.cgi?id=1111740 but it's not clear which (headroom or tailroom) is at fault. Annotate the message a bit to get more information. Signed-off-by: Johannes Berg --- net/mac80211/wpa.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index f7d4ca4c46e0..983527a4c1ab 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -64,8 +64,11 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx) if (!info->control.hw_key) tail += IEEE80211_TKIP_ICV_LEN; - if (WARN_ON(skb_tailroom(skb) < tail || - skb_headroom(skb) < IEEE80211_TKIP_IV_LEN)) + if (WARN(skb_tailroom(skb) < tail || + skb_headroom(skb) < IEEE80211_TKIP_IV_LEN, + "mmic: not enough head/tail (%d/%d,%d/%d)\n", + skb_headroom(skb), IEEE80211_TKIP_IV_LEN, + skb_tailroom(skb), tail)) return TX_DROP; key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY]; -- cgit v1.2.3 From 72c6fb915ff2d30ae14053edee4f0d30019bad76 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 15 Aug 2014 21:06:51 +0300 Subject: Bluetooth: Fix incorrect LE CoC PDU length restriction based on HCI MTU The l2cap_create_le_flowctl_pdu() function that l2cap_segment_le_sdu() calls is perfectly capable of doing packet fragmentation if given bigger PDUs than the HCI buffers allow. Forcing the PDU length based on the HCI MTU (conn->mtu) would therefore needlessly strict operation on hardware with limited LE buffers (e.g. both Intel and Broadcom seem to have this set to just 27 bytes). This patch removes the restriction and makes it possible to send PDUs of the full length that the remote MPS value allows. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/l2cap_core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 4a90438d99df..52b56808d5d3 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -2358,12 +2358,8 @@ static int l2cap_segment_le_sdu(struct l2cap_chan *chan, BT_DBG("chan %p, msg %p, len %zu", chan, msg, len); - pdu_len = chan->conn->mtu - L2CAP_HDR_SIZE; - - pdu_len = min_t(size_t, pdu_len, chan->remote_mps); - sdu_len = len; - pdu_len -= L2CAP_SDULEN_SIZE; + pdu_len = chan->remote_mps - L2CAP_SDULEN_SIZE; while (len > 0) { if (len <= pdu_len) -- cgit v1.2.3 From b3ed6c63f7d4a51b01a61b10e53a2992ad26aa78 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 15 Aug 2014 21:06:52 +0300 Subject: Bluetooth: Remove unnecessary l2cap_chan_unlock before l2cap_chan_add The l2cap_chan_add() function doesn't require the channel to be unlocked. It only requires the l2cap_conn to be unlocked. Therefore, it's unnecessary to unlock a channel before calling l2cap_chan_add(). This patch removes such unnecessary unlocking from the l2cap_chan_connect() function. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 52b56808d5d3..777c41dbfdbe 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7078,9 +7078,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bacpy(&chan->src, &hcon->src); chan->src_type = bdaddr_type(hcon, hcon->src_type); - l2cap_chan_unlock(chan); l2cap_chan_add(conn, chan); - l2cap_chan_lock(chan); /* l2cap_chan_add takes its own ref so we can drop this one */ hci_conn_drop(hcon); -- cgit v1.2.3 From c16900cf285ca240f0f84117bf8b88a03c55469b Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 15 Aug 2014 21:17:06 +0300 Subject: Bluetooth: Fix hci_conn reference counting for fixed channels Now that SMP has been converted to use fixed channels we've got a bit of a problem with the hci_conn reference counting. So far the L2CAP code has kept a reference for each L2CAP channel that was notified of the connection. With SMP however this would mean that the connection is never dropped even though there are no other users of it. Furthermore, SMP already does its own hci_conn reference counting internally, starting from a security or pairing request and ending with the key distribution. This patch makes L2CAP fixed channels default to the L2CAP core not keeping a hci_conn reference for them. A new FLAG_HOLD_HCI_CONN flag is added so that L2CAP users can declare an exception to this rule and hold a reference even for their fixed channels. One such exception is the L2CAP socket layer which does want a reference for each socket (e.g. an ATT socket which uses a fixed channel). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 + net/bluetooth/l2cap_core.c | 12 ++++++++++-- net/bluetooth/l2cap_sock.c | 8 ++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index cedda399f9c0..1558eccb19db 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -711,6 +711,7 @@ enum { FLAG_DEFER_SETUP, FLAG_LE_CONN_REQ_SENT, FLAG_PENDING_SECURITY, + FLAG_HOLD_HCI_CONN, }; enum { diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 777c41dbfdbe..a6559225bb50 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -546,7 +546,10 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) l2cap_chan_hold(chan); - hci_conn_hold(conn->hcon); + /* Only keep a reference for fixed channels if they requested it */ + if (chan->chan_type != L2CAP_CHAN_FIXED || + test_bit(FLAG_HOLD_HCI_CONN, &chan->flags)) + hci_conn_hold(conn->hcon); list_add(&chan->list, &conn->chan_l); } @@ -577,7 +580,12 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) chan->conn = NULL; - if (chan->scid != L2CAP_CID_A2MP) + /* Reference was only held for non-fixed channels or + * fixed channels that explicitly requested it using the + * FLAG_HOLD_HCI_CONN flag. + */ + if (chan->chan_type != L2CAP_CHAN_FIXED || + test_bit(FLAG_HOLD_HCI_CONN, &chan->flags)) hci_conn_drop(conn->hcon); if (mgr && mgr->bredr_chan == chan) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index ed06f88e6f10..31f106e61ca2 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -146,6 +146,14 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) case L2CAP_CHAN_RAW: chan->sec_level = BT_SECURITY_SDP; break; + case L2CAP_CHAN_FIXED: + /* Fixed channels default to the L2CAP core not holding a + * hci_conn reference for them. For fixed channels mapping to + * L2CAP sockets we do want to hold a reference so set the + * appropriate flag to request it. + */ + set_bit(FLAG_HOLD_HCI_CONN, &chan->flags); + break; } bacpy(&chan->src, &la.l2_bdaddr); -- cgit v1.2.3 From 08853f18eafe65aa97deb464c28505a67c898b0e Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 15 Aug 2014 21:06:55 +0300 Subject: Bluetooth: Set addr_type only when it's needed In the hci_le_conn_complete_evt() function there's no need to set the addr_type value until it's actually needed, i.e. for the black list lookup. This patch moves the code a bit further down in the function. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 3a99f30a3317..e8f35a90add5 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4193,16 +4193,16 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->dst_type = irk->addr_type; } - if (conn->dst_type == ADDR_LE_DEV_PUBLIC) - addr_type = BDADDR_LE_PUBLIC; - else - addr_type = BDADDR_LE_RANDOM; - if (ev->status) { hci_le_conn_failed(conn, ev->status); goto unlock; } + if (conn->dst_type == ADDR_LE_DEV_PUBLIC) + addr_type = BDADDR_LE_PUBLIC; + else + addr_type = BDADDR_LE_RANDOM; + /* Drop the connection if the device is blocked */ if (hci_bdaddr_list_lookup(&hdev->blacklist, &conn->dst, addr_type)) { hci_conn_drop(conn); -- cgit v1.2.3 From 5477610fc187d4aae0f699d21dfb0e3f440f7de7 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 15 Aug 2014 21:06:56 +0300 Subject: Bluetooth: Optimize connection parameter lookup for LE connections When we get an LE connection complete event there's really no reason to look through the entire connection parameter list as the entry should be present in the hdev->pend_le_conns list too. This patch changes the lookup code to do a more restricted lookup only in the pend_le_conns list. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index e8f35a90add5..d2ee162ecddb 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4225,7 +4225,8 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_proto_connect_cfm(conn, ev->status); - params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); + params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, + conn->dst_type); if (params) { list_del_init(¶ms->action); if (params->conn) { -- cgit v1.2.3 From 51bb8457ddfa74ede52bf8c02054dea831d59fff Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 15 Aug 2014 21:06:57 +0300 Subject: Bluetooth: Improve *_get() functions to return the object type It's natural to have *_get() functions that increment the reference count of an object to return the object type itself. This way it's simple to make a copy of the object pointer and increase the reference count in a single step. This patch updates two such get() functions, namely hci_conn_get() and l2cap_conn_get(), and updates the users to take advantage of the new API. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 ++- include/net/bluetooth/l2cap.h | 2 +- net/bluetooth/hidp/core.c | 10 ++++------ net/bluetooth/l2cap_core.c | 6 +++--- 4 files changed, 10 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b0ded1333865..aa75eee8ad7f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -756,9 +756,10 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status); * _get()/_drop() in it, but require the caller to have a valid ref (FIXME). */ -static inline void hci_conn_get(struct hci_conn *conn) +static inline struct hci_conn *hci_conn_get(struct hci_conn *conn) { get_device(&conn->dev); + return conn; } static inline void hci_conn_put(struct hci_conn *conn) diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 1558eccb19db..8f1652ed3326 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -948,7 +948,7 @@ void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan, void __l2cap_physical_cfm(struct l2cap_chan *chan, int result); void l2cap_conn_shutdown(struct l2cap_conn *conn, int err); -void l2cap_conn_get(struct l2cap_conn *conn); +struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 6c7ecf116e74..1b7d605706aa 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -915,7 +915,7 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, /* connection management */ bacpy(&session->bdaddr, bdaddr); - session->conn = conn; + session->conn = l2cap_conn_get(conn); session->user.probe = hidp_session_probe; session->user.remove = hidp_session_remove; session->ctrl_sock = ctrl_sock; @@ -941,13 +941,13 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, if (ret) goto err_free; - l2cap_conn_get(session->conn); get_file(session->intr_sock->file); get_file(session->ctrl_sock->file); *out = session; return 0; err_free: + l2cap_conn_put(session->conn); kfree(session); return ret; } @@ -1327,10 +1327,8 @@ int hidp_connection_add(struct hidp_connadd_req *req, conn = NULL; l2cap_chan_lock(chan); - if (chan->conn) { - l2cap_conn_get(chan->conn); - conn = chan->conn; - } + if (chan->conn) + conn = l2cap_conn_get(chan->conn); l2cap_chan_unlock(chan); if (!conn) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index a6559225bb50..cb36169ef300 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1695,9 +1695,10 @@ static void l2cap_conn_free(struct kref *ref) kfree(conn); } -void l2cap_conn_get(struct l2cap_conn *conn) +struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn) { kref_get(&conn->ref); + return conn; } EXPORT_SYMBOL(l2cap_conn_get); @@ -6908,8 +6909,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) kref_init(&conn->ref); hcon->l2cap_data = conn; - conn->hcon = hcon; - hci_conn_get(conn->hcon); + conn->hcon = hci_conn_get(hcon); conn->hchan = hchan; BT_DBG("hcon %p conn %p hchan %p", hcon, conn, hchan); -- cgit v1.2.3 From f8aaf9b65a77267f749c1af641e46c3457d50701 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Sun, 17 Aug 2014 23:28:57 +0300 Subject: Bluetooth: Fix using hci_conn_get() for hci_conn pointers Wherever we keep hci_conn pointers around we should be using hci_conn_get/put to ensure that they stay valid. This patch fixes all places violating against the principle currently. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 1 + net/bluetooth/hci_core.c | 9 +++++++-- net/bluetooth/hci_event.c | 3 ++- net/bluetooth/mgmt.c | 12 ++++++++---- 4 files changed, 18 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index faff6247ac8f..4ecc9d5fce7a 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -595,6 +595,7 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) conn->dst_type); if (params && params->conn) { hci_conn_drop(params->conn); + hci_conn_put(params->conn); params->conn = NULL; } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 9b7145959a49..ed60d37ea646 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2541,6 +2541,7 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev) list_for_each_entry(p, &hdev->le_conn_params, list) { if (p->conn) { hci_conn_drop(p->conn); + hci_conn_put(p->conn); p->conn = NULL; } list_del_init(&p->action); @@ -3734,8 +3735,10 @@ void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) if (!params) return; - if (params->conn) + if (params->conn) { hci_conn_drop(params->conn); + hci_conn_put(params->conn); + } list_del(¶ms->action); list_del(¶ms->list); @@ -3767,8 +3770,10 @@ void hci_conn_params_clear_all(struct hci_dev *hdev) struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { - if (params->conn) + if (params->conn) { hci_conn_drop(params->conn); + hci_conn_put(params->conn); + } list_del(¶ms->action); list_del(¶ms->list); kfree(params); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d2ee162ecddb..e6a496ae0318 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4231,6 +4231,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) list_del_init(¶ms->action); if (params->conn) { hci_conn_drop(params->conn); + hci_conn_put(params->conn); params->conn = NULL; } } @@ -4322,7 +4323,7 @@ static void check_pending_le_conn(struct hci_dev *hdev, bdaddr_t *addr, * the parameters get removed and keep the reference * count consistent once the connection is established. */ - params->conn = conn; + params->conn = hci_conn_get(conn); return; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c2457435a670..d8c66663ade8 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3063,6 +3063,7 @@ static void pairing_complete(struct pending_cmd *cmd, u8 status) conn->disconn_cfm_cb = NULL; hci_conn_drop(conn); + hci_conn_put(conn); mgmt_pending_remove(cmd); } @@ -3212,7 +3213,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, } conn->io_capability = cp->io_cap; - cmd->user_data = conn; + cmd->user_data = hci_conn_get(conn); if ((conn->state == BT_CONNECTED || conn->state == BT_CONFIG) && hci_conn_security(conn, sec_level, auth_type, true)) @@ -4914,6 +4915,7 @@ static void get_conn_info_complete(struct pending_cmd *cmd, void *data) match->mgmt_status, &rp, sizeof(rp)); hci_conn_drop(conn); + hci_conn_put(conn); mgmt_pending_remove(cmd); } @@ -5070,7 +5072,7 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, } hci_conn_hold(conn); - cmd->user_data = conn; + cmd->user_data = hci_conn_get(conn); conn->conn_info_timestamp = jiffies; } else { @@ -5134,8 +5136,10 @@ send_rsp: cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(status), &rp, sizeof(rp)); mgmt_pending_remove(cmd); - if (conn) + if (conn) { hci_conn_drop(conn); + hci_conn_put(conn); + } unlock: hci_dev_unlock(hdev); @@ -5198,7 +5202,7 @@ static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data, if (conn) { hci_conn_hold(conn); - cmd->user_data = conn; + cmd->user_data = hci_conn_get(conn); hci_cp.handle = cpu_to_le16(conn->handle); hci_cp.which = 0x01; /* Piconet clock */ -- cgit v1.2.3 From f6c63249698aaa87399e795adcf3b70171384dc2 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 15 Aug 2014 21:06:59 +0300 Subject: Bluetooth: Refactor connection parameter freeing into its own function The necessary steps for freeing connection paramaters have grown quite a bit so we can simplify the code by factoring it out into its own function. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ed60d37ea646..0d3782ad9a5b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3726,15 +3726,8 @@ int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type, return 0; } -/* This function requires the caller holds hdev->lock */ -void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) +static void hci_conn_params_free(struct hci_conn_params *params) { - struct hci_conn_params *params; - - params = hci_conn_params_lookup(hdev, addr, addr_type); - if (!params) - return; - if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); @@ -3743,6 +3736,18 @@ void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) list_del(¶ms->action); list_del(¶ms->list); kfree(params); +} + +/* This function requires the caller holds hdev->lock */ +void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) +{ + struct hci_conn_params *params; + + params = hci_conn_params_lookup(hdev, addr, addr_type); + if (!params) + return; + + hci_conn_params_free(params); hci_update_background_scan(hdev); @@ -3769,15 +3774,8 @@ void hci_conn_params_clear_all(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; - list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { - if (params->conn) { - hci_conn_drop(params->conn); - hci_conn_put(params->conn); - } - list_del(¶ms->action); - list_del(¶ms->list); - kfree(params); - } + list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) + hci_conn_params_free(params); hci_update_background_scan(hdev); -- cgit v1.2.3 From 6c388d32ec1b9fcc2f2404fb5e9b3b0096be5de9 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 00:41:42 +0300 Subject: Bluetooth: Fix hci_conn reference counting with hci_chan The hci_chan_del() function was doing a hci_conn_drop() but there was no matching hci_conn_hold() in the hci_chan_create() function. Furthermore, as the hci_chan struct holds a pointer to the hci_conn there should be proper use of hci_conn_get/put. This patch fixes both issues so that hci_chan does correct reference counting of the hci_conn object. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 4ecc9d5fce7a..7815826a48e4 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1295,7 +1295,8 @@ struct hci_chan *hci_chan_create(struct hci_conn *conn) if (!chan) return NULL; - chan->conn = conn; + chan->conn = hci_conn_get(conn); + hci_conn_hold(conn); skb_queue_head_init(&chan->data_q); chan->state = BT_CONNECTED; @@ -1316,6 +1317,7 @@ void hci_chan_del(struct hci_chan *chan) synchronize_rcu(); hci_conn_drop(conn); + hci_conn_put(conn); skb_queue_purge(&chan->data_q); kfree(chan); -- cgit v1.2.3 From b3ff670a44cc34c01e78900c42255511e4f232e6 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 00:41:43 +0300 Subject: Bluetooth: Set disc_timeout to 0 when calling hci_chan_del The hci_chan_del() function is used in scenarios where we've decided we want to get rid of the underlying baseband link. It makes therefore sense to force the disc_timeout to 0 so that the disconnection routines are immediately scheduled. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 7815826a48e4..cb04a4e3c829 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1316,6 +1316,9 @@ void hci_chan_del(struct hci_chan *chan) synchronize_rcu(); + /* Force the connection to be immediately dropped */ + conn->disc_timeout = 0; + hci_conn_drop(conn); hci_conn_put(conn); -- cgit v1.2.3 From f94b665dcf15324f5ac8aa639e47be0829b6409d Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 00:41:44 +0300 Subject: Bluetooth: Ignore incoming data after initiating disconnection When hci_chan_del is called the disconnection routines get scheduled through a workqueue. If there's any incoming ACL data before the routines get executed there's a chance that a new hci_chan is created and the disconnection never happens. This patch adds a new hci_conn flag to indicate that we're in the process of driving the connection down. We set the flag in hci_chan_del and check for it in hci_chan_create so that no new channels are created for the same connection. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_conn.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 18c24f6fce6c..dbe73642c54c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -553,6 +553,7 @@ enum { HCI_CONN_FIPS, HCI_CONN_STK_ENCRYPT, HCI_CONN_AUTH_INITIATOR, + HCI_CONN_DROP, }; static inline bool hci_conn_ssp_enabled(struct hci_conn *conn) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index cb04a4e3c829..aaa7e388d026 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1291,6 +1291,11 @@ struct hci_chan *hci_chan_create(struct hci_conn *conn) BT_DBG("%s hcon %p", hdev->name, conn); + if (test_bit(HCI_CONN_DROP, &conn->flags)) { + BT_DBG("Refusing to create new hci_chan"); + return NULL; + } + chan = kzalloc(sizeof(*chan), GFP_KERNEL); if (!chan) return NULL; @@ -1318,6 +1323,7 @@ void hci_chan_del(struct hci_chan *chan) /* Force the connection to be immediately dropped */ conn->disc_timeout = 0; + set_bit(HCI_CONN_DROP, &conn->flags); hci_conn_drop(conn); hci_conn_put(conn); -- cgit v1.2.3 From bcbb655a180344d8004ede669228992bff1921e4 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 20:33:27 +0300 Subject: Bluetooth: Remove hci_conn_hold/drop from hci_chan We can't have hci_chan contribute to the "active" reference counting of the hci_conn since otherwise the connection would never get dropped when there are no more users (since hci_chan would be counted as a user). This patch removes hold() when creating the hci_chan and drop() when destroying it. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index aaa7e388d026..5157a0990732 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1301,7 +1301,6 @@ struct hci_chan *hci_chan_create(struct hci_conn *conn) return NULL; chan->conn = hci_conn_get(conn); - hci_conn_hold(conn); skb_queue_head_init(&chan->data_q); chan->state = BT_CONNECTED; @@ -1321,11 +1320,9 @@ void hci_chan_del(struct hci_chan *chan) synchronize_rcu(); - /* Force the connection to be immediately dropped */ - conn->disc_timeout = 0; + /* Prevent new hci_chan's to be created for this hci_conn */ set_bit(HCI_CONN_DROP, &conn->flags); - hci_conn_drop(conn); hci_conn_put(conn); skb_queue_purge(&chan->data_q); -- cgit v1.2.3 From e31fb86005a01b7df8427b09d0158da28d0c773a Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 20:33:28 +0300 Subject: Bluetooth: Set discon_timeout to 0 in l2cap_conn_del When the l2cap_conn_del() function is used we do not want to wait around "in case something happens" before disconnecting. This patch sets the disconnection timeout to 0 so that the disconnection routines get immediately scheduled. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index cb36169ef300..2d550afe4322 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1640,6 +1640,9 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_unregister_all_users(conn); + /* Force the connection to be immediately dropped */ + hcon->disc_timeout = 0; + mutex_lock(&conn->chan_lock); /* Kill channels */ -- cgit v1.2.3 From 1e91c29eb60c031f4297d1a58125d0bd37691348 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 20:33:29 +0300 Subject: Bluetooth: Use hci_disconnect for immediate disconnection from SMP Relying on the l2cap_conn_del procedure (triggered through the l2cap_conn_shutdown API) to get the connection disconnected is not reliable as it depends on all users releasing (through hci_conn_drop) and that there's at least one user (so hci_conn_drop is called at least one time). A much simpler and more reliable solution is to call hci_disconnect() directly from the SMP code when we want to disconnect. One side-effect this has is that it prevents any SMP Failure PDU from being sent before the disconnection, however neither one of the scenarios where l2cap_conn_shutdown was used really requires this. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 07ca4ce0943b..496584921fdc 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -829,7 +829,7 @@ static void smp_timeout(struct work_struct *work) BT_DBG("conn %p", conn); - l2cap_conn_shutdown(conn, ETIMEDOUT); + hci_disconnect(conn->hcon, HCI_ERROR_REMOTE_USER_TERM); } static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) @@ -1569,7 +1569,7 @@ static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) if (smp) cancel_delayed_work_sync(&smp->security_timer); - l2cap_conn_shutdown(chan->conn, -err); + hci_disconnect(chan->conn->hcon, HCI_ERROR_AUTH_FAILURE); } return err; -- cgit v1.2.3 From b04afa0c280b7e7ced88692251d75a78c8fcb2a7 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 20:33:30 +0300 Subject: Bluetooth: Remove unused l2cap_conn_shutdown API Now that there are no more users of the l2cap_conn_shutdown API (since smp.c switched to using hci_disconnect) we can simply remove it along with all of it's l2cap_conn variables. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 4 ---- net/bluetooth/l2cap_core.c | 25 ------------------------- 2 files changed, 29 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 8f1652ed3326..be25eddea615 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -625,9 +625,6 @@ struct l2cap_conn { struct delayed_work info_timer; - int disconn_err; - struct work_struct disconn_work; - struct sk_buff *rx_skb; __u32 rx_len; __u8 tx_ident; @@ -947,7 +944,6 @@ void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan, u8 status); void __l2cap_physical_cfm(struct l2cap_chan *chan, int result); -void l2cap_conn_shutdown(struct l2cap_conn *conn, int err); struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 2d550afe4322..2d9a2b58d2c8 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1635,9 +1635,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (work_pending(&conn->pending_rx_work)) cancel_work_sync(&conn->pending_rx_work); - if (work_pending(&conn->disconn_work)) - cancel_work_sync(&conn->disconn_work); - l2cap_unregister_all_users(conn); /* Force the connection to be immediately dropped */ @@ -1670,26 +1667,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_conn_put(conn); } -static void disconn_work(struct work_struct *work) -{ - struct l2cap_conn *conn = container_of(work, struct l2cap_conn, - disconn_work); - - BT_DBG("conn %p", conn); - - l2cap_conn_del(conn->hcon, conn->disconn_err); -} - -void l2cap_conn_shutdown(struct l2cap_conn *conn, int err) -{ - struct hci_dev *hdev = conn->hcon->hdev; - - BT_DBG("conn %p err %d", conn, err); - - conn->disconn_err = err; - queue_work(hdev->workqueue, &conn->disconn_work); -} - static void l2cap_conn_free(struct kref *ref) { struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref); @@ -6943,8 +6920,6 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); - INIT_WORK(&conn->disconn_work, disconn_work); - skb_queue_head_init(&conn->pending_rx); INIT_WORK(&conn->pending_rx_work, process_pending_rx); -- cgit v1.2.3 From 9b7b18ef1bea82e5fc1e05da386ff57b0f60f651 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 20:33:31 +0300 Subject: Bluetooth: Fix SMP error and response to be mutually exclusive Returning failure from the SMP data parsing function will cause an immediate disconnect, making any attempts to send a response PDU futile. This patch updates the function to always either send a response or return an error, but never both at the same time: * In the case that HCI_LE_ENABLED is not set we want to send a Pairing Not Supported response but it is not required to force a disconnection, so do not set the error return in this case. * If we get garbage SMP data we can just fail with the handler function instead of also trying to send an SMP Failure PDU. * There's no reason to force a disconnection if we receive an unknown SMP command. Instead simply send a proper Command Not Supported SMP response. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 496584921fdc..16c181181775 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1430,7 +1430,6 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) return -EILSEQ; if (!test_bit(HCI_LE_ENABLED, &hcon->hdev->dev_flags)) { - err = -EOPNOTSUPP; reason = SMP_PAIRING_NOTSUPP; goto done; } @@ -1447,7 +1446,6 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) if (code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ && !test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) { BT_ERR("Unexpected SMP command 0x%02x. Disconnecting.", code); - reason = SMP_CMD_NOTSUPP; err = -EOPNOTSUPP; goto done; } @@ -1459,7 +1457,6 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) case SMP_CMD_PAIRING_FAIL: smp_failure(conn, 0); - reason = 0; err = -EPERM; break; @@ -1501,17 +1498,17 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) default: BT_DBG("Unknown command code 0x%2.2x", code); - reason = SMP_CMD_NOTSUPP; - err = -EOPNOTSUPP; goto done; } done: - if (reason) - smp_failure(conn, reason); - if (!err) + if (!err) { + if (reason) + smp_failure(conn, reason); kfree_skb(skb); + } + return err; } -- cgit v1.2.3 From e3b679d56caa2bc555dee646a6ac5861631e7a28 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 20:33:32 +0300 Subject: Bluetooth: Update hci_disconnect() to return an error value We'll soon use hci_disconnect() from places that are interested to know whether the hci_send_cmd() really succeeded or not. This patch updates hci_disconnect() to pass on any error returned from hci_send_cmd(). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_conn.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index dbe73642c54c..2b6e04d37593 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -703,7 +703,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, return NULL; } -void hci_disconnect(struct hci_conn *conn, __u8 reason); +int hci_disconnect(struct hci_conn *conn, __u8 reason); bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 5157a0990732..dd2df20b0f7d 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -122,7 +122,7 @@ static void hci_reject_sco(struct hci_conn *conn) hci_send_cmd(conn->hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp); } -void hci_disconnect(struct hci_conn *conn, __u8 reason) +int hci_disconnect(struct hci_conn *conn, __u8 reason) { struct hci_cp_disconnect cp; @@ -132,7 +132,7 @@ void hci_disconnect(struct hci_conn *conn, __u8 reason) cp.handle = cpu_to_le16(conn->handle); cp.reason = reason; - hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp); + return hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp); } static void hci_amp_disconn(struct hci_conn *conn) -- cgit v1.2.3 From e3f2f92a047cd2be3c87a2aaf0a8958e1fd4c17a Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 20:33:33 +0300 Subject: Bluetooth: Use hci_disconnect() for mgmt_disconnect_device() There's no reason to custom build the HCI_Disconnect command in the Disconnect Device mgmt command handler. This patch updates the code to use hci_disconnect() instead. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index d8c66663ade8..ab9521ae3c63 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2788,7 +2788,6 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_disconnect *cp = data; struct mgmt_rp_disconnect rp; - struct hci_cp_disconnect dc; struct pending_cmd *cmd; struct hci_conn *conn; int err; @@ -2836,10 +2835,7 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - dc.handle = cpu_to_le16(conn->handle); - dc.reason = HCI_ERROR_REMOTE_USER_TERM; - - err = hci_send_cmd(hdev, HCI_OP_DISCONNECT, sizeof(dc), &dc); + err = hci_disconnect(conn, HCI_ERROR_REMOTE_USER_TERM); if (err < 0) mgmt_pending_remove(cmd); -- cgit v1.2.3 From 839035a7b3acd17a6f739b1fb50298e9499d3fa4 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 18 Aug 2014 20:33:34 +0300 Subject: Bluetooth: Move clock offset reading into hci_disconnect() To give all hci_disconnect() users the advantage of getting the clock offset read automatically this patch moves the necessary code from hci_conn_timeout() into hci_disconnect(). This way we pretty much always update the clock offset when disconnecting. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index dd2df20b0f7d..e3d7ae9e2edd 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -128,6 +128,19 @@ int hci_disconnect(struct hci_conn *conn, __u8 reason) BT_DBG("hcon %p", conn); + /* When we are master of an established connection and it enters + * the disconnect timeout, then go ahead and try to read the + * current clock offset. Processing of the result is done + * within the event handling and hci_clock_offset_evt function. + */ + if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER) { + struct hci_dev *hdev = conn->hdev; + struct hci_cp_read_clock_offset cp; + + cp.handle = cpu_to_le16(conn->handle); + hci_send_cmd(hdev, HCI_OP_READ_CLOCK_OFFSET, sizeof(cp), &cp); + } + conn->state = BT_DISCONN; cp.handle = cpu_to_le16(conn->handle); @@ -325,25 +338,6 @@ static void hci_conn_timeout(struct work_struct *work) hci_amp_disconn(conn); } else { __u8 reason = hci_proto_disconn_ind(conn); - - /* When we are master of an established connection - * and it enters the disconnect timeout, then go - * ahead and try to read the current clock offset. - * - * Processing of the result is done within the - * event handling and hci_clock_offset_evt function. - */ - if (conn->type == ACL_LINK && - conn->role == HCI_ROLE_MASTER) { - struct hci_dev *hdev = conn->hdev; - struct hci_cp_read_clock_offset cp; - - cp.handle = cpu_to_le16(conn->handle); - - hci_send_cmd(hdev, HCI_OP_READ_CLOCK_OFFSET, - sizeof(cp), &cp); - } - hci_disconnect(conn, reason); } break; -- cgit v1.2.3 From 434714dc02b286d3f21179c651a6f1a84e199eb7 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 1 Sep 2014 09:45:03 +0300 Subject: Bluetooth: Add clarifying comment for LE CoC result value The "pending" L2CAP response value is not defined for LE CoC. This patch adds a clarifying comment to the code so that the reader will not think there is a bug in trying to use this value for LE CoC. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 2d9a2b58d2c8..ab405f0e53cb 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5413,6 +5413,11 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) { l2cap_state_change(chan, BT_CONNECT2); + /* The following result value is actually not defined + * for LE CoC but we use it to let the function know + * that it should bail out after doing its cleanup + * instead of sending a response. + */ result = L2CAP_CR_PEND; chan->ops->defer(chan); } else { -- cgit v1.2.3 From 1b0921d6be7860271ccf6027891b8215fc28cde5 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 5 Sep 2014 22:19:48 +0300 Subject: Bluetooth: Remove unnecessary checks after canceling SMP security timer The SMP security timer used to be able to modify the SMP context state but now days it simply calls hci_disconnect(). It is therefore unnecessary to have extra sanity checks for the SMP context after canceling the timer. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 16c181181775..b8ecc7bd3e3b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -282,8 +282,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) smp = chan->data; cancel_delayed_work_sync(&smp->security_timer); - if (test_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) - schedule_delayed_work(&smp->security_timer, SMP_TIMEOUT); + schedule_delayed_work(&smp->security_timer, SMP_TIMEOUT); } static __u8 authreq_to_seclevel(__u8 authreq) @@ -375,9 +374,6 @@ static void smp_chan_destroy(struct l2cap_conn *conn) BUG_ON(!smp); cancel_delayed_work_sync(&smp->security_timer); - /* In case the timeout freed the SMP context */ - if (!chan->data) - return; if (work_pending(&smp->distribute_work)) { cancel_work_sync(&smp->distribute_work); -- cgit v1.2.3 From 84bc0db53b3a425fb992d5fed25b575e4434167a Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 5 Sep 2014 22:19:49 +0300 Subject: Bluetooth: Don't take any action in smp_resume_cb if not encrypted When smp_resume_cb is called if we're not encrypted (i.e. the callback wasn't called because the connection became encrypted) we shouldn't take any action at all. This patch moves also the security_timer cancellation behind this condition. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index b8ecc7bd3e3b..9accb4739488 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1533,10 +1533,12 @@ static void smp_resume_cb(struct l2cap_chan *chan) if (!smp) return; + if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) + return; + cancel_delayed_work(&smp->security_timer); - if (test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) - queue_work(hdev->workqueue, &smp->distribute_work); + queue_work(hdev->workqueue, &smp->distribute_work); } static void smp_ready_cb(struct l2cap_chan *chan) -- cgit v1.2.3 From f3d82d0c8ec025fc113408e3ad5775fed5a060ff Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 5 Sep 2014 22:19:50 +0300 Subject: Bluetooth: Move identity address update behind a workqueue The identity address update of all channels for an l2cap_conn needs to take the lock for each channel, i.e. it's safest to do this by a separate workqueue callback. Previously this was partially solved by moving the entire SMP key distribution behind a workqueue. However, if we want SMP context locking to be correct and safe we should always use the l2cap_chan lock when accessing it, meaning even smp_distribute_keys needs to take that lock which would once again create a dead lock when updating the identity address. The simplest way to solve this is to have l2cap_conn manage the deferred work which is what this patch does. A subsequent patch will remove the now unnecessary SMP key distribution work struct. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 3 ++- net/bluetooth/l2cap_core.c | 10 ++++++++-- net/bluetooth/smp.c | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index be25eddea615..ead99f032f7a 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -633,6 +633,8 @@ struct l2cap_conn { struct sk_buff_head pending_rx; struct work_struct pending_rx_work; + struct work_struct id_addr_update_work; + __u8 disc_reason; struct l2cap_chan *smp; @@ -937,7 +939,6 @@ int l2cap_ertm_init(struct l2cap_chan *chan); void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan); void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan); void l2cap_chan_del(struct l2cap_chan *chan, int err); -void l2cap_conn_update_id_addr(struct hci_conn *hcon); void l2cap_send_conn_req(struct l2cap_chan *chan); void l2cap_move_start(struct l2cap_chan *chan); void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan, diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ab405f0e53cb..b71430caab4a 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -631,9 +631,11 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) } EXPORT_SYMBOL_GPL(l2cap_chan_del); -void l2cap_conn_update_id_addr(struct hci_conn *hcon) +static void l2cap_conn_update_id_addr(struct work_struct *work) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_conn *conn = container_of(work, struct l2cap_conn, + id_addr_update_work); + struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan; mutex_lock(&conn->chan_lock); @@ -1635,6 +1637,9 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (work_pending(&conn->pending_rx_work)) cancel_work_sync(&conn->pending_rx_work); + if (work_pending(&conn->id_addr_update_work)) + cancel_work_sync(&conn->id_addr_update_work); + l2cap_unregister_all_users(conn); /* Force the connection to be immediately dropped */ @@ -6927,6 +6932,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) skb_queue_head_init(&conn->pending_rx); INIT_WORK(&conn->pending_rx_work, process_pending_rx); + INIT_WORK(&conn->id_addr_update_work, l2cap_conn_update_id_addr); conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 9accb4739488..795c603bed30 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -654,7 +654,7 @@ static void smp_notify_keys(struct l2cap_conn *conn) */ bacpy(&hcon->dst, &smp->remote_irk->bdaddr); hcon->dst_type = smp->remote_irk->addr_type; - l2cap_conn_update_id_addr(hcon); + queue_work(hdev->workqueue, &conn->id_addr_update_work); /* When receiving an indentity resolving key for * a remote device that does not use a resolvable -- cgit v1.2.3 From d6268e86a12a94a4f5193551c2367162e6a37db4 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 5 Sep 2014 22:19:51 +0300 Subject: Bluetooth: Remove unnecessary deferred work for SMP key distribution Now that the identity address update happens through its own deferred work there's no need to have smp_distribute_keys anymore behind a second deferred work. This patch removes this extra construction and makes the code do direct calls to smp_distribute_keys() again. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 795c603bed30..0b4403f3dce1 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -46,7 +46,6 @@ enum { struct smp_chan { struct l2cap_conn *conn; struct delayed_work security_timer; - struct work_struct distribute_work; u8 preq[7]; /* SMP Pairing Request */ u8 prsp[7]; /* SMP Pairing Response */ @@ -375,12 +374,6 @@ static void smp_chan_destroy(struct l2cap_conn *conn) cancel_delayed_work_sync(&smp->security_timer); - if (work_pending(&smp->distribute_work)) { - cancel_work_sync(&smp->distribute_work); - if (!chan->data) - return; - } - complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags); mgmt_smp_complete(conn->hcon, complete); @@ -703,10 +696,8 @@ static void smp_notify_keys(struct l2cap_conn *conn) } } -static void smp_distribute_keys(struct work_struct *work) +static void smp_distribute_keys(struct smp_chan *smp) { - struct smp_chan *smp = container_of(work, struct smp_chan, - distribute_work); struct smp_cmd_pairing *req, *rsp; struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; @@ -850,7 +841,6 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) smp->conn = conn; chan->data = smp; - INIT_WORK(&smp->distribute_work, smp_distribute_keys); INIT_DELAYED_WORK(&smp->security_timer, smp_timeout); hci_conn_hold(conn->hcon); @@ -1290,7 +1280,7 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) rp->ediv, rp->rand); smp->ltk = ltk; if (!(smp->remote_key_dist & SMP_DIST_ID_KEY)) - queue_work(hdev->workqueue, &smp->distribute_work); + smp_distribute_keys(smp); hci_dev_unlock(hdev); return 0; @@ -1368,7 +1358,7 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, smp->id_addr_type, smp->irk, &rpa); distribute: - queue_work(hdev->workqueue, &smp->distribute_work); + smp_distribute_keys(smp); hci_dev_unlock(hcon->hdev); @@ -1404,7 +1394,7 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(csrk->val, rp->csrk, sizeof(csrk->val)); } smp->csrk = csrk; - queue_work(hdev->workqueue, &smp->distribute_work); + smp_distribute_keys(smp); hci_dev_unlock(hdev); return 0; @@ -1526,7 +1516,6 @@ static void smp_resume_cb(struct l2cap_chan *chan) struct smp_chan *smp = chan->data; struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; - struct hci_dev *hdev = hcon->hdev; BT_DBG("chan %p", chan); @@ -1538,7 +1527,7 @@ static void smp_resume_cb(struct l2cap_chan *chan) cancel_delayed_work(&smp->security_timer); - queue_work(hdev->workqueue, &smp->distribute_work); + smp_distribute_keys(smp); } static void smp_ready_cb(struct l2cap_chan *chan) -- cgit v1.2.3 From fc75cc8684d21d3649b28c4c37d4ce3f000759e4 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 5 Sep 2014 22:19:52 +0300 Subject: Bluetooth: Fix locking of the SMP context Before the move the l2cap_chan the SMP context (smp_chan) didn't have any kind of proper locking. The best there existed was the HCI_CONN_LE_SMP_PEND flag which was used to enable mutual exclusion for potential multiple creators of the SMP context. Now that SMP has been converted to use the l2cap_chan infrastructure and since the SMP context is directly mapped to a corresponding l2cap_chan we get the SMP context locking essentially for free through the l2cap_chan lock. For all callbacks that l2cap_core.c makes for each channel implementation (smp.c in the case of SMP) the l2cap_chan lock is held through l2cap_chan_lock(chan). Since the calls from l2cap_core.c to smp.c are covered the only missing piece to have the locking implemented properly is to ensure that the lock is held for any other call path that may access the SMP context. This means user responses through mgmt.c, requests to elevate the security of a connection through hci_conn.c, as well as any deferred work through workqueues. This patch adds the necessary locking to all these other code paths that try to access the SMP context. Since mutual exclusion for the l2cap_chan access is now covered from all directions the patch also removes unnecessary HCI_CONN_LE_SMP_PEND flag (once we've acquired the chan lock we can simply check whether chan->smp is set to know if there's an SMP context). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 - net/bluetooth/smp.c | 76 +++++++++++++++++++++++----------------- 2 files changed, 44 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2b6e04d37593..045d9133d180 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -539,7 +539,6 @@ enum { HCI_CONN_RSWITCH_PEND, HCI_CONN_MODE_CHANGE_PEND, HCI_CONN_SCO_SETUP_PEND, - HCI_CONN_LE_SMP_PEND, HCI_CONN_MGMT_CONNECTED, HCI_CONN_SSP_ENABLED, HCI_CONN_SC_ENABLED, diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 0b4403f3dce1..c71589f4b435 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -409,7 +409,6 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason) { struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; - struct smp_chan *smp; if (reason) smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason), @@ -419,12 +418,7 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason) mgmt_auth_failed(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, HCI_ERROR_AUTH_FAILURE); - if (!chan->data) - return; - - smp = chan->data; - - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) + if (chan->data) smp_chan_destroy(conn); } @@ -706,9 +700,6 @@ static void smp_distribute_keys(struct smp_chan *smp) BT_DBG("conn %p", conn); - if (!test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) - return; - rsp = (void *) &smp->prsp[1]; /* The responder sends its keys first */ @@ -801,7 +792,6 @@ static void smp_distribute_keys(struct smp_chan *smp) if ((smp->remote_key_dist & 0x07)) return; - clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags); set_bit(SMP_FLAG_COMPLETE, &smp->flags); smp_notify_keys(conn); @@ -825,16 +815,13 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) struct smp_chan *smp; smp = kzalloc(sizeof(*smp), GFP_ATOMIC); - if (!smp) { - clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags); + if (!smp) return NULL; - } smp->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(smp->tfm_aes)) { BT_ERR("Unable to create ECB crypto context"); kfree(smp); - clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags); return NULL; } @@ -854,16 +841,23 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) struct l2cap_chan *chan; struct smp_chan *smp; u32 value; + int err; BT_DBG(""); - if (!conn || !test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) + if (!conn) return -ENOTCONN; chan = conn->smp; if (!chan) return -ENOTCONN; + l2cap_chan_lock(chan); + if (!chan->data) { + err = -ENOTCONN; + goto unlock; + } + smp = chan->data; switch (mgmt_op) { @@ -879,12 +873,16 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) case MGMT_OP_USER_PASSKEY_NEG_REPLY: case MGMT_OP_USER_CONFIRM_NEG_REPLY: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); - return 0; + err = 0; + goto unlock; default: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto unlock; } + err = 0; + /* If it is our turn to send Pairing Confirm, do so now */ if (test_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) { u8 rsp = smp_confirm(smp); @@ -892,12 +890,15 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) smp_failure(conn, rsp); } - return 0; +unlock: + l2cap_chan_unlock(chan); + return err; } static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing rsp, *req = (void *) skb->data; + struct l2cap_chan *chan = conn->smp; struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp; u8 key_size, auth, sec_level; @@ -911,12 +912,10 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (conn->hcon->role != HCI_ROLE_SLAVE) return SMP_CMD_NOTSUPP; - if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { + if (!chan->data) smp = smp_chan_create(conn); - } else { - struct l2cap_chan *chan = conn->smp; + else smp = chan->data; - } if (!smp) return SMP_UNSPECIFIED; @@ -1122,6 +1121,7 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_security_req *rp = (void *) skb->data; struct smp_cmd_pairing cp; struct hci_conn *hcon = conn->hcon; + struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; u8 sec_level; @@ -1143,7 +1143,8 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; - if (test_and_set_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) + /* If SMP is already in progress ignore this request */ + if (chan->data) return 0; smp = smp_chan_create(conn); @@ -1170,8 +1171,10 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; __u8 authreq; + int ret; BT_DBG("conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level); @@ -1192,12 +1195,19 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; - if (test_and_set_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) - return 0; + l2cap_chan_lock(chan); + + /* If SMP is already in progress ignore this request */ + if (chan->data) { + ret = 0; + goto unlock; + } smp = smp_chan_create(conn); - if (!smp) - return 1; + if (!smp) { + ret = 1; + goto unlock; + } authreq = seclevel_to_authreq(sec_level); @@ -1223,8 +1233,11 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) } set_bit(SMP_FLAG_INITIATOR, &smp->flags); + ret = 0; - return 0; +unlock: + l2cap_chan_unlock(chan); + return ret; } static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) @@ -1315,7 +1328,6 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; - struct hci_dev *hdev = hcon->hdev; bdaddr_t rpa; BT_DBG(""); @@ -1430,7 +1442,7 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) * returns an error). */ if (code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ && - !test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) { + !chan->data) { BT_ERR("Unexpected SMP command 0x%02x. Disconnecting.", code); err = -EOPNOTSUPP; goto done; @@ -1504,7 +1516,7 @@ static void smp_teardown_cb(struct l2cap_chan *chan, int err) BT_DBG("chan %p", chan); - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) + if (chan->data) smp_chan_destroy(conn); conn->smp = NULL; -- cgit v1.2.3 From 88d3a8acf33e8f4989a1032998eb819a89829573 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 5 Sep 2014 22:19:53 +0300 Subject: Bluetooth: Add define for key distribution mask This patch adds a define for the allowed bits of the key distribution mask so we don't have to have magic 0x07 constants throughout the code. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index c71589f4b435..5003d224c4b6 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -34,6 +34,7 @@ #define SMP_TIMEOUT msecs_to_jiffies(30000) #define AUTH_REQ_MASK 0x07 +#define KEY_DIST_MASK 0x07 enum { SMP_FLAG_TK_VALID, @@ -703,7 +704,7 @@ static void smp_distribute_keys(struct smp_chan *smp) rsp = (void *) &smp->prsp[1]; /* The responder sends its keys first */ - if (hcon->out && (smp->remote_key_dist & 0x07)) + if (hcon->out && (smp->remote_key_dist & KEY_DIST_MASK)) return; req = (void *) &smp->preq[1]; @@ -789,7 +790,7 @@ static void smp_distribute_keys(struct smp_chan *smp) } /* If there are still keys to be received wait for them */ - if ((smp->remote_key_dist & 0x07)) + if (smp->remote_key_dist & KEY_DIST_MASK) return; set_bit(SMP_FLAG_COMPLETE, &smp->flags); -- cgit v1.2.3 From c6e81e9ae61cae3ea265e8f7fb2cbe59afc63594 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 5 Sep 2014 22:19:54 +0300 Subject: Bluetooth: Fix calling smp_distribute_keys() when still waiting for keys When we're in the process of receiving keys in phase 3 of SMP we keep track of which keys are still expected in the smp->remote_key_dist variable. If we still have some key bits set we need to continue waiting for more PDUs and not needlessly call smp_distribute_keys(). This patch fixes two such cases in the smp_cmd_master_ident() and smp_cmd_ident_addr_info() handler functions. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 5003d224c4b6..e76c963011e5 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1293,7 +1293,7 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) authenticated, smp->tk, smp->enc_key_size, rp->ediv, rp->rand); smp->ltk = ltk; - if (!(smp->remote_key_dist & SMP_DIST_ID_KEY)) + if (!(smp->remote_key_dist & KEY_DIST_MASK)) smp_distribute_keys(smp); hci_dev_unlock(hdev); @@ -1371,7 +1371,8 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, smp->id_addr_type, smp->irk, &rpa); distribute: - smp_distribute_keys(smp); + if (!(smp->remote_key_dist & KEY_DIST_MASK)) + smp_distribute_keys(smp); hci_dev_unlock(hcon->hdev); -- cgit v1.2.3 From b28b4943660f4e36f118b751ec606c103ba6b1cc Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 5 Sep 2014 22:19:55 +0300 Subject: Bluetooth: Add strict checks for allowed SMP PDUs SMP defines quite clearly when certain PDUs are to be expected/allowed and when not, but doesn't have any explicit request/response definition. So far the code has relied on each PDU handler to behave correctly if receiving PDUs at an unexpected moment, however this requires many different checks and is prone to errors. This patch introduces a generic way to keep track of allowed PDUs and thereby reduces the responsibility & load on individual command handlers. The tracking is implemented using a simple bit-mask where each opcode maps to its own bit. If the bit is set the corresponding PDU is allow and if the bit is not set the PDU is not allowed. As a simple example, when we send the Pairing Request we'd set the bit for Pairing Response, and when we receive the Pairing Response we'd clear the bit for Pairing Response. Since the disallowed PDU rejection is now done in a single central place we need to be a bit careful of which action makes most sense to all cases. Previously some, such as Security Request, have been simply ignored whereas others have caused an explicit disconnect. The only PDU rejection action that keeps good interoperability and can be used for all the applicable use cases is to drop the data. This may raise some concerns of us now being more lenient for misbehaving (and potentially malicious) devices, but the policy of simply dropping data has been a successful one for many years e.g. in L2CAP (where this is the *only* policy for such cases - we never request disconnection in l2cap_core.c because of bad data). Furthermore, we cannot prevent connected devices from creating the SMP context (through a Security or Pairing Request), and once the context exists looking up the corresponding bit for the received opcode and deciding to reject it is essentially an equally lightweight operation as the kind of rejection that l2cap_core.c already successfully does. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 120 +++++++++++++++++++++++++++++++++++----------------- net/bluetooth/smp.h | 2 + 2 files changed, 84 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index e76c963011e5..1201670afe38 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -31,6 +31,9 @@ #include "smp.h" +#define SMP_ALLOW_CMD(smp, code) set_bit(code, &smp->allow_cmd) +#define SMP_DISALLOW_CMD(smp, code) clear_bit(code, &smp->allow_cmd) + #define SMP_TIMEOUT msecs_to_jiffies(30000) #define AUTH_REQ_MASK 0x07 @@ -47,6 +50,7 @@ enum { struct smp_chan { struct l2cap_conn *conn; struct delayed_work security_timer; + unsigned long allow_cmd; /* Bitmask of allowed commands */ u8 preq[7]; /* SMP Pairing Request */ u8 prsp[7]; /* SMP Pairing Response */ @@ -553,6 +557,11 @@ static u8 smp_confirm(struct smp_chan *smp) smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp); + if (conn->hcon->out) + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); + else + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); + return 0; } @@ -691,6 +700,20 @@ static void smp_notify_keys(struct l2cap_conn *conn) } } +static void smp_allow_key_dist(struct smp_chan *smp) +{ + /* Allow the first expected phase 3 PDU. The rest of the PDUs + * will be allowed in each PDU handler to ensure we receive + * them in the correct order. + */ + if (smp->remote_key_dist & SMP_DIST_ENC_KEY) + SMP_ALLOW_CMD(smp, SMP_CMD_ENCRYPT_INFO); + else if (smp->remote_key_dist & SMP_DIST_ID_KEY) + SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); + else if (smp->remote_key_dist & SMP_DIST_SIGN) + SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); +} + static void smp_distribute_keys(struct smp_chan *smp) { struct smp_cmd_pairing *req, *rsp; @@ -704,8 +727,10 @@ static void smp_distribute_keys(struct smp_chan *smp) rsp = (void *) &smp->prsp[1]; /* The responder sends its keys first */ - if (hcon->out && (smp->remote_key_dist & KEY_DIST_MASK)) + if (hcon->out && (smp->remote_key_dist & KEY_DIST_MASK)) { + smp_allow_key_dist(smp); return; + } req = (void *) &smp->preq[1]; @@ -790,8 +815,10 @@ static void smp_distribute_keys(struct smp_chan *smp) } /* If there are still keys to be received wait for them */ - if (smp->remote_key_dist & KEY_DIST_MASK) + if (smp->remote_key_dist & KEY_DIST_MASK) { + smp_allow_key_dist(smp); return; + } set_bit(SMP_FLAG_COMPLETE, &smp->flags); smp_notify_keys(conn); @@ -829,6 +856,8 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) smp->conn = conn; chan->data = smp; + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_FAIL); + INIT_DELAYED_WORK(&smp->security_timer, smp_timeout); hci_conn_hold(conn->hcon); @@ -925,6 +954,8 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) (req->auth_req & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; + SMP_DISALLOW_CMD(smp, SMP_CMD_PAIRING_REQ); + smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], req, sizeof(*req)); skb_pull(skb, sizeof(*req)); @@ -958,6 +989,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(&smp->prsp[1], &rsp, sizeof(rsp)); smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp); + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); /* Request setup of TK */ ret = tk_request(conn, 0, auth, rsp.io_capability, req->io_capability); @@ -983,6 +1015,8 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) if (conn->hcon->role != HCI_ROLE_MASTER) return SMP_CMD_NOTSUPP; + SMP_DISALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); + skb_pull(skb, sizeof(*rsp)); req = (void *) &smp->preq[1]; @@ -1040,13 +1074,19 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(smp->pcnf)) return SMP_INVALID_PARAMS; + SMP_DISALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); + memcpy(smp->pcnf, skb->data, sizeof(smp->pcnf)); skb_pull(skb, sizeof(smp->pcnf)); - if (conn->hcon->out) + if (conn->hcon->out) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); - else if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); + return 0; + } + + if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) return smp_confirm(smp); else set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); @@ -1064,6 +1104,8 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(smp->rrnd)) return SMP_INVALID_PARAMS; + SMP_DISALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); + memcpy(smp->rrnd, skb->data, sizeof(smp->rrnd)); skb_pull(skb, sizeof(smp->rrnd)); @@ -1122,7 +1164,6 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_security_req *rp = (void *) skb->data; struct smp_cmd_pairing cp; struct hci_conn *hcon = conn->hcon; - struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; u8 sec_level; @@ -1144,10 +1185,6 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; - /* If SMP is already in progress ignore this request */ - if (chan->data) - return 0; - smp = smp_chan_create(conn); if (!smp) return SMP_UNSPECIFIED; @@ -1165,6 +1202,7 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(&smp->preq[1], &cp, sizeof(cp)); smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); return 0; } @@ -1227,10 +1265,12 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) memcpy(&smp->preq[1], &cp, sizeof(cp)); smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); } else { struct smp_cmd_security_req cp; cp.auth_req = authreq; smp_send_cmd(conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp); + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ); } set_bit(SMP_FLAG_INITIATOR, &smp->flags); @@ -1252,9 +1292,8 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; - /* Ignore this PDU if it wasn't requested */ - if (!(smp->remote_key_dist & SMP_DIST_ENC_KEY)) - return 0; + SMP_DISALLOW_CMD(smp, SMP_CMD_ENCRYPT_INFO); + SMP_ALLOW_CMD(smp, SMP_CMD_MASTER_IDENT); skb_pull(skb, sizeof(*rp)); @@ -1278,13 +1317,13 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; - /* Ignore this PDU if it wasn't requested */ - if (!(smp->remote_key_dist & SMP_DIST_ENC_KEY)) - return 0; - /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ENC_KEY; + SMP_DISALLOW_CMD(smp, SMP_CMD_MASTER_IDENT); + if (smp->remote_key_dist & SMP_DIST_ID_KEY) + SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); + skb_pull(skb, sizeof(*rp)); hci_dev_lock(hdev); @@ -1311,9 +1350,8 @@ static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; - /* Ignore this PDU if it wasn't requested */ - if (!(smp->remote_key_dist & SMP_DIST_ID_KEY)) - return 0; + SMP_DISALLOW_CMD(smp, SMP_CMD_IDENT_INFO); + SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO); skb_pull(skb, sizeof(*info)); @@ -1336,13 +1374,13 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; - /* Ignore this PDU if it wasn't requested */ - if (!(smp->remote_key_dist & SMP_DIST_ID_KEY)) - return 0; - /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ID_KEY; + SMP_DISALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO); + if (smp->remote_key_dist & SMP_DIST_SIGN) + SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); + skb_pull(skb, sizeof(*info)); hci_dev_lock(hcon->hdev); @@ -1392,13 +1430,11 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; - /* Ignore this PDU if it wasn't requested */ - if (!(smp->remote_key_dist & SMP_DIST_SIGN)) - return 0; - /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_SIGN; + SMP_DISALLOW_CMD(smp, SMP_CMD_SIGN_INFO); + skb_pull(skb, sizeof(*rp)); hci_dev_lock(hdev); @@ -1418,6 +1454,7 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) { struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; + struct smp_chan *smp; __u8 code, reason; int err = 0; @@ -1437,18 +1474,19 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) code = skb->data[0]; skb_pull(skb, sizeof(code)); - /* - * The SMP context must be initialized for all other PDUs except - * pairing and security requests. If we get any other PDU when - * not initialized simply disconnect (done if this function - * returns an error). + smp = chan->data; + + if (code > SMP_CMD_MAX) + goto drop; + + if (smp && !test_bit(code, &smp->allow_cmd)) + goto drop; + + /* If we don't have a context the only allowed commands are + * pairing request and security request. */ - if (code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ && - !chan->data) { - BT_ERR("Unexpected SMP command 0x%02x. Disconnecting.", code); - err = -EOPNOTSUPP; - goto done; - } + if (!smp && code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ) + goto drop; switch (code) { case SMP_CMD_PAIRING_REQ: @@ -1510,6 +1548,12 @@ done: } return err; + +drop: + BT_ERR("%s unexpected SMP command 0x%02x from %pMR", hcon->hdev->name, + code, &hcon->dst); + kfree_skb(skb); + return 0; } static void smp_teardown_cb(struct l2cap_chan *chan, int err) diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index cf1094617c69..5240537efde3 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -102,6 +102,8 @@ struct smp_cmd_security_req { __u8 auth_req; } __packed; +#define SMP_CMD_MAX 0x0b + #define SMP_PASSKEY_ENTRY_FAILED 0x01 #define SMP_OOB_NOT_AVAIL 0x02 #define SMP_AUTH_REQUIREMENTS 0x03 -- cgit v1.2.3 From 9f06a8d623b2c3aea09292b844e329fbfb401231 Mon Sep 17 00:00:00 2001 From: Behan Webster Date: Fri, 5 Sep 2014 16:03:34 -0700 Subject: Bluetooth: LLVMLinux: Remove VLAIS from bluetooth/amp.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaced the use of a Variable Length Array In Struct (VLAIS) with a C99 compliant equivalent. This patch allocates the appropriate amount of memory using an char array. The new code can be compiled with both gcc and clang. struct shash_desc contains a flexible array member member ctx declared with CRYPTO_MINALIGN_ATTR, so sizeof(struct shash_desc) aligns the beginning of the array declared after struct shash_desc with long long. No trailing padding is required because it is not a struct type that can be used in an array. The CRYPTO_MINALIGN_ATTR is required so that desc is aligned with long long as would be the case for a struct containing a member with CRYPTO_MINALIGN_ATTR. Signed-off-by: Behan Webster Signed-off-by: Mark Charlebois Signed-off-by: Jan-Simon Möller Signed-off-by: Marcel Holtmann --- net/bluetooth/amp.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index 016cdb66df6c..2640d78f30b8 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -149,15 +149,14 @@ static int hmac_sha256(u8 *key, u8 ksize, char *plaintext, u8 psize, u8 *output) if (ret) { BT_DBG("crypto_ahash_setkey failed: err %d", ret); } else { - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(tfm)]; - } desc; + char desc[sizeof(struct shash_desc) + + crypto_shash_descsize(tfm)] CRYPTO_MINALIGN_ATTR; + struct shash_desc *shash = (struct shash_desc *)desc; - desc.shash.tfm = tfm; - desc.shash.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + shash->tfm = tfm; + shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - ret = crypto_shash_digest(&desc.shash, plaintext, psize, + ret = crypto_shash_digest(shash, plaintext, psize, output); } -- cgit v1.2.3 From c68b7f127d5f517c214e8bcf231d0188f6776d2a Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Sat, 6 Sep 2014 06:59:10 +0300 Subject: Bluetooth: Fix dereferencing conn variable before NULL check This patch fixes the following type of static analyzer warning (and probably a real bug as well as the NULL check should be there for a reason): net/bluetooth/smp.c:1182 smp_conn_security() warn: variable dereferenced before check 'conn' (see line 1174) Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 1201670afe38..560f78a9f960 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1210,7 +1210,7 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { struct l2cap_conn *conn = hcon->l2cap_data; - struct l2cap_chan *chan = conn->smp; + struct l2cap_chan *chan; struct smp_chan *smp; __u8 authreq; int ret; @@ -1221,6 +1221,8 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (!conn) return 1; + chan = conn->smp; + if (!test_bit(HCI_LE_ENABLED, &hcon->hdev->dev_flags)) return 1; -- cgit v1.2.3 From 1e701f16982a9d15488a5aa8c7f5c41444b1de67 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 8 Sep 2014 08:29:12 -0700 Subject: net: Fix GRE RX to use skb_transport_header for GRE header offset GRE assumes that the GRE header is at skb_network_header + ip_hrdlen(skb). It is more general to use skb_transport_header and this allows the possbility of inserting additional header between IP and GRE (which is what we will done in Generic UDP Encapsulation for GRE). Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/gre_demux.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 7e0756da8737..4a7b5b2a1ce3 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -98,7 +98,6 @@ EXPORT_SYMBOL_GPL(gre_build_header); static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err) { - unsigned int ip_hlen = ip_hdrlen(skb); const struct gre_base_hdr *greh; __be32 *options; int hdr_len; @@ -106,7 +105,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) return -EINVAL; - greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); + greh = (struct gre_base_hdr *)skb_transport_header(skb); if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; @@ -116,7 +115,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (!pskb_may_pull(skb, hdr_len)) return -EINVAL; - greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); + greh = (struct gre_base_hdr *)skb_transport_header(skb); tpi->proto = greh->protocol; options = (__be32 *)(greh + 1); -- cgit v1.2.3 From a7f26b7e1ee73ac9e766c430fea5af658d839954 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 8 Sep 2014 19:08:34 -0400 Subject: inet: remove dead inetpeer sequence code inetpeer sequence numbers are no longer incremented, so no need to check and flush the tree. The function that increments the sequence number was already dead code and removed in in "ipv4: remove unused function" (068a6e18). Remove the code that checks for a change, too. Verifying that v4_seq and v6_seq are never incremented and thus that flush_check compares bp->flush_seq to 0 is trivial. The second part of the change removes flush_check completely even though bp->flush_seq is exactly !0 once, at initialization. This change is correct because the time this branch is true is when bp->root == peer_avl_empty_rcu, in which the branch and inetpeer_invalidate_tree are a NOOP. Signed-off-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inetpeer.h | 1 - net/ipv4/inetpeer.c | 21 --------------------- 2 files changed, 22 deletions(-) (limited to 'net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 01d590ee5e7e..80479abddf73 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -61,7 +61,6 @@ struct inet_peer { struct inet_peer_base { struct inet_peer __rcu *root; seqlock_t lock; - u32 flush_seq; int total; }; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index bd5f5928167d..241afd743d2c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -72,29 +72,10 @@ void inet_peer_base_init(struct inet_peer_base *bp) { bp->root = peer_avl_empty_rcu; seqlock_init(&bp->lock); - bp->flush_seq = ~0U; bp->total = 0; } EXPORT_SYMBOL_GPL(inet_peer_base_init); -static atomic_t v4_seq = ATOMIC_INIT(0); -static atomic_t v6_seq = ATOMIC_INIT(0); - -static atomic_t *inetpeer_seq_ptr(int family) -{ - return (family == AF_INET ? &v4_seq : &v6_seq); -} - -static inline void flush_check(struct inet_peer_base *base, int family) -{ - atomic_t *fp = inetpeer_seq_ptr(family); - - if (unlikely(base->flush_seq != atomic_read(fp))) { - inetpeer_invalidate_tree(base); - base->flush_seq = atomic_read(fp); - } -} - #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ /* Exported for sysctl_net_ipv4. */ @@ -444,8 +425,6 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, unsigned int sequence; int invalidated, gccnt = 0; - flush_check(base, daddr->family); - /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ -- cgit v1.2.3 From e1e930f591bfd9604c3077f0af5c390f4f890259 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 8 Sep 2014 17:09:49 -0700 Subject: Bluetooth: Fix mgmt pairing failure when authentication fails Whether through HCI with BR/EDR or SMP with LE when authentication fails we should also notify any pending Pair Device mgmt command. This patch updates the mgmt_auth_failed function to take the actual hci_conn object and makes sure that any pending pairing command is notified and cleaned up appropriately. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 +-- net/bluetooth/hci_event.c | 6 ++---- net/bluetooth/mgmt.c | 19 +++++++++++++------ net/bluetooth/smp.c | 3 +-- 4 files changed, 17 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 045d9133d180..206b92bfeebb 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1342,8 +1342,7 @@ int mgmt_user_passkey_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u32 passkey, u8 entered); -void mgmt_auth_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, - u8 addr_type, u8 status); +void mgmt_auth_failed(struct hci_conn *conn, u8 status); void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status); void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status); void mgmt_sc_enable_complete(struct hci_dev *hdev, u8 enable, u8 status); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index e6a496ae0318..3a8381ab992f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2320,8 +2320,7 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->sec_level = conn->pending_sec_level; } } else { - mgmt_auth_failed(hdev, &conn->dst, conn->type, conn->dst_type, - ev->status); + mgmt_auth_failed(conn, ev->status); } clear_bit(HCI_CONN_AUTH_PEND, &conn->flags); @@ -3900,8 +3899,7 @@ static void hci_simple_pair_complete_evt(struct hci_dev *hdev, * event gets always produced as initiator and is also mapped to * the mgmt_auth_failed event */ if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) && ev->status) - mgmt_auth_failed(hdev, &conn->dst, conn->type, conn->dst_type, - ev->status); + mgmt_auth_failed(conn, ev->status); hci_conn_drop(conn); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index ab9521ae3c63..efb71b022ab6 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6485,16 +6485,23 @@ int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, return mgmt_event(MGMT_EV_PASSKEY_NOTIFY, hdev, &ev, sizeof(ev), NULL); } -void mgmt_auth_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, - u8 addr_type, u8 status) +void mgmt_auth_failed(struct hci_conn *conn, u8 hci_status) { struct mgmt_ev_auth_failed ev; + struct pending_cmd *cmd; + u8 status = mgmt_status(hci_status); - bacpy(&ev.addr.bdaddr, bdaddr); - ev.addr.type = link_to_bdaddr(link_type, addr_type); - ev.status = mgmt_status(status); + bacpy(&ev.addr.bdaddr, &conn->dst); + ev.addr.type = link_to_bdaddr(conn->type, conn->dst_type); + ev.status = status; - mgmt_event(MGMT_EV_AUTH_FAILED, hdev, &ev, sizeof(ev), NULL); + cmd = find_pairing(conn); + + mgmt_event(MGMT_EV_AUTH_FAILED, conn->hdev, &ev, sizeof(ev), + cmd ? cmd->sk : NULL); + + if (cmd) + pairing_complete(cmd, status); } void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status) diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 560f78a9f960..25c9040e0b12 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -420,8 +420,7 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason) &reason); clear_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags); - mgmt_auth_failed(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, - HCI_ERROR_AUTH_FAILURE); + mgmt_auth_failed(hcon, HCI_ERROR_AUTH_FAILURE); if (chan->data) smp_chan_destroy(conn); -- cgit v1.2.3 From 2ae50d8d3aaf7154f72b44331b71f15799cdc1bb Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Mon, 8 Sep 2014 12:11:43 +0300 Subject: Bluetooth: 6lowpan: Increase the connection timeout value Use the default connection timeout value defined in l2cap.h because the current timeout was too short and most of the time the connection attempts timed out. Signed-off-by: Jukka Rissanen Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org # 3.17.x --- net/bluetooth/6lowpan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 35ebe79c87b0..5532e6e804f0 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -890,7 +890,7 @@ static void chan_resume_cb(struct l2cap_chan *chan) static long chan_get_sndtimeo_cb(struct l2cap_chan *chan) { - return msecs_to_jiffies(1000); + return L2CAP_CONN_TIMEOUT; } static const struct l2cap_ops bt_6lowpan_chan_ops = { -- cgit v1.2.3 From b2799cec22812f5f1aaaa57133df51876f685d84 Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Mon, 8 Sep 2014 12:11:44 +0300 Subject: Bluetooth: 6lowpan: Set the peer IPv6 address correctly The peer IPv6 address contained wrong U/L bit in the EUI-64 part. Signed-off-by: Jukka Rissanen Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org # 3.17.x --- net/bluetooth/6lowpan.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 5532e6e804f0..b229aed3bf9a 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -671,6 +671,14 @@ static struct l2cap_chan *chan_open(struct l2cap_chan *pchan) return chan; } +static void set_ip_addr_bits(u8 addr_type, u8 *addr) +{ + if (addr_type == BDADDR_LE_PUBLIC) + *addr |= 0x02; + else + *addr &= ~0x02; +} + static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, struct lowpan_dev *dev) { @@ -693,6 +701,11 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, memcpy(&peer->eui64_addr, (u8 *)&peer->peer_addr.s6_addr + 8, EUI64_ADDR_LEN); + /* IPv6 address needs to have the U/L bit set properly so toggle + * it back here. + */ + set_ip_addr_bits(chan->dst_type, (u8 *)&peer->peer_addr.s6_addr + 8); + write_lock_irqsave(&devices_lock, flags); INIT_LIST_HEAD(&peer->list); peer_add(dev, peer); -- cgit v1.2.3 From 39e90c77637b3892a39f2908aea57539e961c50e Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Mon, 8 Sep 2014 12:11:45 +0300 Subject: Bluetooth: 6lowpan: Route packets that are not meant to peer via correct device Packets that are supposed to be delivered via the peer device need to be checked and sent to correct device. This requires that user has set the routes properly so that the 6lowpan module can then figure out the destination gateway and the correct Bluetooth device. Signed-off-by: Jukka Rissanen Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org # 3.17.x --- net/bluetooth/6lowpan.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index b229aed3bf9a..0920cb6ed572 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -39,6 +39,7 @@ static struct dentry *lowpan_control_debugfs; struct skb_cb { struct in6_addr addr; + struct in6_addr gw; struct l2cap_chan *chan; int status; }; @@ -158,6 +159,54 @@ static inline struct lowpan_peer *peer_lookup_conn(struct lowpan_dev *dev, return NULL; } +static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev, + struct in6_addr *daddr, + struct sk_buff *skb) +{ + struct lowpan_peer *peer, *tmp; + struct in6_addr *nexthop; + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + int count = atomic_read(&dev->peer_count); + + BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt); + + /* If we have multiple 6lowpan peers, then check where we should + * send the packet. If only one peer exists, then we can send the + * packet right away. + */ + if (count == 1) + return list_first_entry(&dev->peers, struct lowpan_peer, + list); + + if (!rt) { + nexthop = &lowpan_cb(skb)->gw; + + if (ipv6_addr_any(nexthop)) + return NULL; + } else { + nexthop = rt6_nexthop(rt); + + /* We need to remember the address because it is needed + * by bt_xmit() when sending the packet. In bt_xmit(), the + * destination routing info is not set. + */ + memcpy(&lowpan_cb(skb)->gw, nexthop, sizeof(struct in6_addr)); + } + + BT_DBG("gw %pI6c", nexthop); + + list_for_each_entry_safe(peer, tmp, &dev->peers, list) { + BT_DBG("dst addr %pMR dst type %d ip %pI6c", + &peer->chan->dst, peer->chan->dst_type, + &peer->peer_addr); + + if (!ipv6_addr_cmp(&peer->peer_addr, nexthop)) + return peer; + } + + return NULL; +} + static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn) { struct lowpan_dev *entry, *tmp; @@ -415,8 +464,18 @@ static int header_create(struct sk_buff *skb, struct net_device *netdev, read_unlock_irqrestore(&devices_lock, flags); if (!peer) { - BT_DBG("no such peer %pMR found", &addr); - return -ENOENT; + /* The packet might be sent to 6lowpan interface + * because of routing (either via default route + * or user set route) so get peer according to + * the destination address. + */ + read_lock_irqsave(&devices_lock, flags); + peer = peer_lookup_dst(dev, &hdr->daddr, skb); + read_unlock_irqrestore(&devices_lock, flags); + if (!peer) { + BT_DBG("no such peer %pMR found", &addr); + return -ENOENT; + } } daddr = peer->eui64_addr; @@ -520,6 +579,8 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) read_lock_irqsave(&devices_lock, flags); peer = peer_lookup_ba(dev, &addr, addr_type); + if (!peer) + peer = peer_lookup_dst(dev, &lowpan_cb(skb)->addr, skb); read_unlock_irqrestore(&devices_lock, flags); BT_DBG("xmit %s to %pMR type %d IP %pI6c peer %p", -- cgit v1.2.3 From 2a5538e9aa4929329813bee69922c9ae4990fcad Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 25 Aug 2014 12:05:27 +0200 Subject: netfilter: nat: move specific NAT IPv6 to core Move the specific NAT IPv6 core functions that are called from the hooks from ip6table_nat.c to nf_nat_l3proto_ipv6.c. This prepares the ground to allow iptables and nft to use the same NAT engine code that comes in a follow up patch. This also renames nf_nat_ipv6_fn to nft_nat_ipv6_fn in net/ipv6/netfilter/nft_chain_nat_ipv6.c to avoid a compilation breakage. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_l3proto.h | 37 +++++ net/ipv6/netfilter/ip6table_nat.c | 233 +++++-------------------------- net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 199 ++++++++++++++++++++++++++ net/ipv6/netfilter/nft_chain_nat_ipv6.c | 10 +- 4 files changed, 275 insertions(+), 204 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h index bc2d51574489..340c013795a4 100644 --- a/include/net/netfilter/nf_nat_l3proto.h +++ b/include/net/netfilter/nf_nat_l3proto.h @@ -84,4 +84,41 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen); +unsigned int nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + +unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)); + #endif /* _NF_NAT_L3PROTO_H */ diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 387d8b8fc18d..b0634ac996b7 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -30,222 +30,57 @@ static const struct xt_table nf_nat_ipv6_table = { .af = NFPROTO_IPV6, }; -static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) -{ - /* Force range to this IP; let proto decide mapping for - * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). - */ - struct nf_nat_range range; - - range.flags = 0; - pr_debug("Allocating NULL binding for %p (%pI6)\n", ct, - HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip6 : - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip6); - - return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); -} - -static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, - const struct net_device *in, - const struct net_device *out, - struct nf_conn *ct) +static unsigned int ip6table_nat_do_chain(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - unsigned int ret; - ret = ip6t_do_table(skb, hooknum, in, out, net->ipv6.ip6table_nat); - if (ret == NF_ACCEPT) { - if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) - ret = alloc_null_binding(ct, hooknum); - } - return ret; + return ip6t_do_table(skb, ops->hooknum, in, out, net->ipv6.ip6table_nat); } -static unsigned int -nf_nat_ipv6_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - struct nf_conn_nat *nat; - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); - __be16 frag_off; - int hdrlen; - u8 nexthdr; - - ct = nf_ct_get(skb, &ctinfo); - /* Can't track? It's not due to stress, or conntrack would - * have dropped it. Hence it's the user's responsibilty to - * packet filter it out, or implement conntrack/NAT for that - * protocol. 8) --RR - */ - if (!ct) - return NF_ACCEPT; - - /* Don't try to NAT if this packet is not conntracked */ - if (nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - nexthdr = ipv6_hdr(skb)->nexthdr; - hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, - ops->hooknum, - hdrlen)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - unsigned int ret; - - ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); - if (ret != NF_ACCEPT) - return ret; - } else { - pr_debug("Already setup manip %s for ct %p\n", - maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", - ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - break; - - default: - /* ESTABLISHED */ - NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); - -oif_changed: - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_DROP; + return nf_nat_ipv6_fn(ops, skb, in, out, ip6table_nat_do_chain); } -static unsigned int -nf_nat_ipv6_in(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - unsigned int ret; - struct in6_addr daddr = ipv6_hdr(skb)->daddr; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) - skb_dst_drop(skb); - - return ret; + return nf_nat_ipv6_in(ops, skb, in, out, ip6table_nat_do_chain); } -static unsigned int -nf_nat_ipv6_out(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { -#ifdef CONFIG_XFRM - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - int err; -#endif - unsigned int ret; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, - &ct->tuplehash[!dir].tuple.dst.u3) || - (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && - ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET6); - if (err < 0) - ret = NF_DROP_ERR(err); - } - } -#endif - return ret; + return nf_nat_ipv6_out(ops, skb, in, out, ip6table_nat_do_chain); } -static unsigned int -nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned int ret; - int err; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, - &ct->tuplehash[!dir].tuple.src.u3)) { - err = ip6_route_me_harder(skb); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#ifdef CONFIG_XFRM - else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && - ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET6); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#endif - } - return ret; + return nf_nat_ipv6_local_fn(ops, skb, in, out, ip6table_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv6_in, + .hook = ip6table_nat_in, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, @@ -253,7 +88,7 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv6_out, + .hook = ip6table_nat_out, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, @@ -261,7 +96,7 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv6_local_fn, + .hook = ip6table_nat_local_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, @@ -269,7 +104,7 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv6_fn, + .hook = ip6table_nat_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index fc8e49b2ff3e..c5812e1c1ffb 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -261,6 +261,205 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); +unsigned int +nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + __be16 frag_off; + int hdrlen; + u8 nexthdr; + + ct = nf_ct_get(skb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + * have dropped it. Hence it's the user's responsibilty to + * packet filter it out, or implement conntrack/NAT for that + * protocol. 8) --RR + */ + if (!ct) + return NF_ACCEPT; + + /* Don't try to NAT if this packet is not conntracked */ + if (nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + nexthdr = ipv6_hdr(skb)->nexthdr; + hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, + ops->hooknum, + hdrlen)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + unsigned int ret; + + ret = do_chain(ops, skb, in, out, ct); + if (ret != NF_ACCEPT) + return ret; + + if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + break; + + ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + if (ret != NF_ACCEPT) + return ret; + } else { + pr_debug("Already setup manip %s for ct %p\n", + maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", + ct); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + break; + + default: + /* ESTABLISHED */ + NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || + ctinfo == IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + +oif_changed: + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn); + +unsigned int +nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + unsigned int ret; + struct in6_addr daddr = ipv6_hdr(skb)->daddr; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) + skb_dst_drop(skb); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_in); + +unsigned int +nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ +#ifdef CONFIG_XFRM + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int err; +#endif + unsigned int ret; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct ipv6hdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3) || + (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && + ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all)) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } + } +#endif + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_out); + +unsigned int +nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + int err; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct ipv6hdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, + &ct->tuplehash[!dir].tuple.src.u3)) { + err = ip6_route_me_harder(skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#ifdef CONFIG_XFRM + else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && + ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#endif + } + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_local_fn); + static int __init nf_nat_l3proto_ipv6_init(void) { int err; diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index d189fcb437fe..c1c74491a10b 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -28,7 +28,7 @@ * IPv6 NAT chains */ -static unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -97,7 +97,7 @@ static unsigned int nf_nat_ipv6_prerouting(const struct nf_hook_ops *ops, struct in6_addr daddr = ipv6_hdr(skb)->daddr; unsigned int ret; - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + ret = nft_nat_ipv6_fn(ops, skb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); @@ -115,7 +115,7 @@ static unsigned int nf_nat_ipv6_postrouting(const struct nf_hook_ops *ops, const struct nf_conn *ct __maybe_unused; unsigned int ret; - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + ret = nft_nat_ipv6_fn(ops, skb, in, out, okfn); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && @@ -143,7 +143,7 @@ static unsigned int nf_nat_ipv6_output(const struct nf_hook_ops *ops, const struct nf_conn *ct; unsigned int ret; - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + ret = nft_nat_ipv6_fn(ops, skb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -177,7 +177,7 @@ static const struct nf_chain_type nft_chain_nat_ipv6 = { [NF_INET_PRE_ROUTING] = nf_nat_ipv6_prerouting, [NF_INET_POST_ROUTING] = nf_nat_ipv6_postrouting, [NF_INET_LOCAL_OUT] = nf_nat_ipv6_output, - [NF_INET_LOCAL_IN] = nf_nat_ipv6_fn, + [NF_INET_LOCAL_IN] = nft_nat_ipv6_fn, }, }; -- cgit v1.2.3 From 876665eafc0e43523d0c57b5c937b59696fb4a8f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Sep 2014 16:31:09 +0200 Subject: netfilter: nft_chain_nat_ipv6: use generic IPv6 NAT code from core Use the exported IPv6 NAT functions that are provided by the core. This removes duplicated code so iptables and nft use the same NAT codebase. Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nft_chain_nat_ipv6.c | 163 +++++++------------------------- 1 file changed, 36 insertions(+), 127 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index c1c74491a10b..1c4b75dd425b 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -24,144 +24,53 @@ #include #include -/* - * IPv6 NAT chains - */ - -static unsigned int nft_nat_ipv6_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) { - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - struct nf_conn_nat *nat; - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); - __be16 frag_off; - int hdrlen; - u8 nexthdr; struct nft_pktinfo pkt; - unsigned int ret; - - if (ct == NULL || nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED + IP_CT_IS_REPLY: - nexthdr = ipv6_hdr(skb)->nexthdr; - hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, - ops->hooknum, - hdrlen)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall through */ - case IP_CT_NEW: - if (nf_nat_initialized(ct, maniptype)) - break; - - nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out); - ret = nft_do_chain(&pkt, ops); - if (ret != NF_ACCEPT) - return ret; - if (!nf_nat_initialized(ct, maniptype)) { - ret = nf_nat_alloc_null_binding(ct, ops->hooknum); - if (ret != NF_ACCEPT) - return ret; - } - default: - break; - } + nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out); - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + return nft_do_chain(&pkt, ops); } -static unsigned int nf_nat_ipv6_prerouting(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv6_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - struct in6_addr daddr = ipv6_hdr(skb)->daddr; - unsigned int ret; - - ret = nft_nat_ipv6_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) - skb_dst_drop(skb); - - return ret; + return nf_nat_ipv6_fn(ops, skb, in, out, nft_nat_do_chain); } -static unsigned int nf_nat_ipv6_postrouting(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv6_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - enum ip_conntrack_info ctinfo __maybe_unused; - const struct nf_conn *ct __maybe_unused; - unsigned int ret; - - ret = nft_nat_ipv6_fn(ops, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, - &ct->tuplehash[!dir].tuple.dst.u3) || - (ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all)) - if (nf_xfrm_me_harder(skb, AF_INET6) < 0) - ret = NF_DROP; - } -#endif - return ret; + return nf_nat_ipv6_in(ops, skb, in, out, nft_nat_do_chain); } -static unsigned int nf_nat_ipv6_output(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv6_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - enum ip_conntrack_info ctinfo; - const struct nf_conn *ct; - unsigned int ret; - - ret = nft_nat_ipv6_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + return nf_nat_ipv6_out(ops, skb, in, out, nft_nat_do_chain); +} - if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, - &ct->tuplehash[!dir].tuple.src.u3)) { - if (ip6_route_me_harder(skb)) - ret = NF_DROP; - } -#ifdef CONFIG_XFRM - else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) - if (nf_xfrm_me_harder(skb, AF_INET6)) - ret = NF_DROP; -#endif - } - return ret; +static unsigned int nft_nat_ipv6_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return nf_nat_ipv6_local_fn(ops, skb, in, out, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv6 = { @@ -174,9 +83,9 @@ static const struct nf_chain_type nft_chain_nat_ipv6 = { (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .hooks = { - [NF_INET_PRE_ROUTING] = nf_nat_ipv6_prerouting, - [NF_INET_POST_ROUTING] = nf_nat_ipv6_postrouting, - [NF_INET_LOCAL_OUT] = nf_nat_ipv6_output, + [NF_INET_PRE_ROUTING] = nft_nat_ipv6_in, + [NF_INET_POST_ROUTING] = nft_nat_ipv6_out, + [NF_INET_LOCAL_OUT] = nft_nat_ipv6_local_fn, [NF_INET_LOCAL_IN] = nft_nat_ipv6_fn, }, }; -- cgit v1.2.3 From 5e266fe7c046b107496a338839cfb6008aeddbd8 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Tue, 2 Sep 2014 16:42:21 +0200 Subject: netfilter: nf_tables: refactor rule deletion helper This helper function always schedule the rule to be removed in the following transaction. In follow-up patches, it is interesting to handle separately the logic of rule activation/disactivation from the transaction mechanism. So, this patch simply splits the original nf_tables_delrule_one() in two functions, allowing further control. While at it, for the sake of homigeneize the function naming scheme, let's rename nf_tables_delrule_one() to nft_delrule(). Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index deeb95fb7028..3664bab1c76c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1868,12 +1868,10 @@ err1: } static int -nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule) +nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) { /* You cannot delete the same rule twice */ if (nft_rule_is_active_next(ctx->net, rule)) { - if (nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule) == NULL) - return -ENOMEM; nft_rule_disactivate_next(ctx->net, rule); ctx->chain->use--; return 0; @@ -1881,13 +1879,31 @@ nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule) return -ENOENT; } +static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) +{ + struct nft_trans *trans; + int err; + + trans = nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule); + if (trans == NULL) + return -ENOMEM; + + err = nf_tables_delrule_deactivate(ctx, rule); + if (err < 0) { + nft_trans_destroy(trans); + return err; + } + + return 0; +} + static int nf_table_delrule_by_chain(struct nft_ctx *ctx) { struct nft_rule *rule; int err; list_for_each_entry(rule, &ctx->chain->rules, list) { - err = nf_tables_delrule_one(ctx, rule); + err = nft_delrule(ctx, rule); if (err < 0) return err; } @@ -1932,7 +1948,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, if (IS_ERR(rule)) return PTR_ERR(rule); - err = nf_tables_delrule_one(&ctx, rule); + err = nft_delrule(&ctx, rule); } else { err = nf_table_delrule_by_chain(&ctx); } -- cgit v1.2.3 From c559879406c10087ea2eab0e1868f79eced12bf9 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Tue, 2 Sep 2014 16:42:23 +0200 Subject: netfilter: nf_tables: add helper to unregister chain hooks This patch adds a helper function to unregister chain hooks in the chain deletion path. Basically, a code factorization. The new function is useful in follow-up patches. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3664bab1c76c..90e349651178 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -127,6 +127,15 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } +static void nf_tables_unregister_hooks(const struct nft_table *table, + const struct nft_chain *chain, + unsigned int hook_nops) +{ + if (!(table->flags & NFT_TABLE_F_DORMANT) && + chain->flags & NFT_BASE_CHAIN) + nf_unregister_hooks(nft_base_chain(chain)->ops, hook_nops); +} + /* * Tables */ @@ -1157,11 +1166,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, list_add_tail_rcu(&chain->list, &table->chains); return 0; err2: - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) { - nf_unregister_hooks(nft_base_chain(chain)->ops, - afi->nops); - } + nf_tables_unregister_hooks(table, chain, afi->nops); err1: nf_tables_chain_destroy(chain); return err; @@ -3368,11 +3373,9 @@ static int nf_tables_commit(struct sk_buff *skb) break; case NFT_MSG_DELCHAIN: nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); - if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && - trans->ctx.chain->flags & NFT_BASE_CHAIN) { - nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, - trans->ctx.afi->nops); - } + nf_tables_unregister_hooks(trans->ctx.table, + trans->ctx.chain, + trans->ctx.afi->nops); break; case NFT_MSG_NEWRULE: nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); @@ -3495,11 +3498,9 @@ static int nf_tables_abort(struct sk_buff *skb) } else { trans->ctx.table->use--; list_del_rcu(&trans->ctx.chain->list); - if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && - trans->ctx.chain->flags & NFT_BASE_CHAIN) { - nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, - trans->ctx.afi->nops); - } + nf_tables_unregister_hooks(trans->ctx.table, + trans->ctx.chain, + trans->ctx.afi->nops); } break; case NFT_MSG_DELCHAIN: -- cgit v1.2.3 From ce24b7217b60980ff0366dc6afbd5418db9972f2 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Tue, 2 Sep 2014 16:42:24 +0200 Subject: netfilter: nf_tables: rename nf_table_delrule_by_chain() For the sake of homogenize the function naming scheme, let's rename nf_table_delrule_by_chain() to nft_delrule_by_chain(). Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 90e349651178..eac4fab6d63b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1902,7 +1902,7 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) return 0; } -static int nf_table_delrule_by_chain(struct nft_ctx *ctx) +static int nft_delrule_by_chain(struct nft_ctx *ctx) { struct nft_rule *rule; int err; @@ -1955,12 +1955,12 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, err = nft_delrule(&ctx, rule); } else { - err = nf_table_delrule_by_chain(&ctx); + err = nft_delrule_by_chain(&ctx); } } else { list_for_each_entry(chain, &table->chains, list) { ctx.chain = chain; - err = nf_table_delrule_by_chain(&ctx); + err = nft_delrule_by_chain(&ctx); if (err < 0) break; } -- cgit v1.2.3 From 3045d76070abe725dbb7fd8ff39c27b820d5a7eb Mon Sep 17 00:00:00 2001 From: Ana Rey Date: Tue, 2 Sep 2014 20:36:14 +0200 Subject: netfilter: nf_tables: add devgroup support in meta expresion Add devgroup support to let us match device group of a packets incoming or outgoing interface. Signed-off-by: Ana Rey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ net/netfilter/nft_meta.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c9b6f00a3fb7..c000947d3f38 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -573,6 +573,8 @@ enum nft_exthdr_attributes { * @NFT_META_BRI_OIFNAME: packet output bridge interface name * @NFT_META_PKTTYPE: packet type (skb->pkt_type), special handling for loopback * @NFT_META_CPU: cpu id through smp_processor_id() + * @NFT_META_IIFGROUP: packet input interface group + * @NFT_META_OIFGROUP: packet output interface group */ enum nft_meta_keys { NFT_META_LEN, @@ -596,6 +598,8 @@ enum nft_meta_keys { NFT_META_BRI_OIFNAME, NFT_META_PKTTYPE, NFT_META_CPU, + NFT_META_IIFGROUP, + NFT_META_OIFGROUP, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 843e099a962d..1e7c076ca63a 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -155,6 +155,16 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_CPU: dest->data[0] = smp_processor_id(); break; + case NFT_META_IIFGROUP: + if (in == NULL) + goto err; + dest->data[0] = in->group; + break; + case NFT_META_OIFGROUP: + if (out == NULL) + goto err; + dest->data[0] = out->group; + break; default: WARN_ON(1); goto err; @@ -228,6 +238,8 @@ int nft_meta_get_init(const struct nft_ctx *ctx, #endif case NFT_META_PKTTYPE: case NFT_META_CPU: + case NFT_META_IIFGROUP: + case NFT_META_OIFGROUP: break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From 5fcf0cf6073d4adb22e34cd1d14a6318699625a9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 3 Sep 2014 00:02:49 +0300 Subject: ipvs: reduce stack usage for sockopt data Use union to reserve the required stack space for sockopt data which is less than the currently hardcoded value of 128. Now the tables for commands should be more readable. The checks added for readability are optimized by compiler, others warn at compile time if command uses too much stack or exceeds the storage of set_arglen and get_arglen. As Dan Carpenter points out, we can run for unprivileged user, so we can silent some error messages. Signed-off-by: Julian Anastasov CC: Dan Carpenter CC: Andrey Utkin CC: David Binderman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 111 ++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 50 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index fd3f444a4f96..bd2b208ba56c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2179,29 +2179,41 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) return 0; } +#define CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) -#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ - sizeof(struct ip_vs_dest_user)) -#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) -#define MAX_ARG_LEN SVCDEST_ARG_LEN - -static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { - [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, - [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, +struct ip_vs_svcdest_user { + struct ip_vs_service_user s; + struct ip_vs_dest_user d; }; +static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { + [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), + [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), + [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), + [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), + [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), + [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user), + [CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), + [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user), + [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user), + [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user), +}; + +union ip_vs_set_arglen { + struct ip_vs_service_user field_IP_VS_SO_SET_ADD; + struct ip_vs_service_user field_IP_VS_SO_SET_EDIT; + struct ip_vs_service_user field_IP_VS_SO_SET_DEL; + struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST; + struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST; + struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST; + struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT; + struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON; + struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON; + struct ip_vs_service_user field_IP_VS_SO_SET_ZERO; +}; + +#define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen) + static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, struct ip_vs_service_user *usvc_compat) { @@ -2239,7 +2251,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { struct net *net = sock_net(sk); int ret; - unsigned char arg[MAX_ARG_LEN]; + unsigned char arg[MAX_SET_ARGLEN]; struct ip_vs_service_user *usvc_compat; struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; @@ -2247,16 +2259,15 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) struct ip_vs_dest_user_kern udest; struct netns_ipvs *ipvs = net_ipvs(net); + BUILD_BUG_ON(sizeof(arg) > 255); if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) return -EINVAL; - if (len < 0 || len > MAX_ARG_LEN) - return -EINVAL; - if (len != set_arglen[SET_CMDID(cmd)]) { - pr_err("set_ctl: len %u != %u\n", - len, set_arglen[SET_CMDID(cmd)]); + if (len != set_arglen[CMDID(cmd)]) { + IP_VS_DBG(1, "set_ctl: len %u != %u\n", + len, set_arglen[CMDID(cmd)]); return -EINVAL; } @@ -2512,51 +2523,51 @@ __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) #endif } +static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = { + [CMDID(IP_VS_SO_GET_VERSION)] = 64, + [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo), + [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services), + [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry), + [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests), + [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), + [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user), +}; -#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) -#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) -#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) -#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) -#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) - -static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { - [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, - [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, +union ip_vs_get_arglen { + char field_IP_VS_SO_GET_VERSION[64]; + struct ip_vs_getinfo field_IP_VS_SO_GET_INFO; + struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES; + struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE; + struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS; + struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT; + struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; }; +#define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) + static int do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { - unsigned char arg[128]; + unsigned char arg[MAX_GET_ARGLEN]; int ret = 0; unsigned int copylen; struct net *net = sock_net(sk); struct netns_ipvs *ipvs = net_ipvs(net); BUG_ON(!net); + BUILD_BUG_ON(sizeof(arg) > 255); if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) return -EINVAL; - if (*len < get_arglen[GET_CMDID(cmd)]) { - pr_err("get_ctl: len %u < %u\n", - *len, get_arglen[GET_CMDID(cmd)]); + copylen = get_arglen[CMDID(cmd)]; + if (*len < (int) copylen) { + IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); return -EINVAL; } - copylen = get_arglen[GET_CMDID(cmd)]; - if (copylen > 128) - return -EINVAL; - if (copy_from_user(arg, user, copylen) != 0) return -EFAULT; /* -- cgit v1.2.3 From c435201bede79735c23d6961ce369034f193e633 Mon Sep 17 00:00:00 2001 From: Bojan Prtvar Date: Mon, 8 Sep 2014 09:51:12 +0200 Subject: netfilter: xt_string: Remove unnecessary initialization of struct ts_state The skb_find_text() accepts uninitialized textsearch state variable. Signed-off-by: Bojan Prtvar Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_string.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index d3c48b14ab94..5699adb97652 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -29,7 +29,6 @@ string_mt(const struct sk_buff *skb, struct xt_action_param *par) struct ts_state state; bool invert; - memset(&state, 0, sizeof(struct ts_state)); invert = conf->u.v1.flags & XT_STRING_FLAG_INVERT; return (skb_find_text((struct sk_buff *)skb, conf->from_offset, -- cgit v1.2.3 From ee01d5425634264089db74889c3547be13c3faef Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Tue, 2 Sep 2014 16:42:25 +0200 Subject: netfilter: nf_tables: add helpers to schedule objects deletion This patch refactor the code to schedule objects deletion. They are useful in follow-up patches. In order to be able to use these new helper functions in all the code, they are placed in the top of the file, with all the dependant functions and symbols. nft_rule_disactivate_next has been renamed to nft_rule_deactivate. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 363 ++++++++++++++++++++++-------------------- 1 file changed, 194 insertions(+), 169 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index eac4fab6d63b..3ce5cfa34935 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -136,6 +136,195 @@ static void nf_tables_unregister_hooks(const struct nft_table *table, nf_unregister_hooks(nft_base_chain(chain)->ops, hook_nops); } +/* Internal table flags */ +#define NFT_TABLE_INACTIVE (1 << 15) + +static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWTABLE) + ctx->table->flags |= NFT_TABLE_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +} + +static int nft_deltable(struct nft_ctx *ctx) +{ + int err; + + err = nft_trans_table_add(ctx, NFT_MSG_DELTABLE); + if (err < 0) + return err; + + list_del_rcu(&ctx->table->list); + return err; +} + +static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWCHAIN) + ctx->chain->flags |= NFT_CHAIN_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +} + +static int nft_delchain(struct nft_ctx *ctx) +{ + int err; + + err = nft_trans_chain_add(ctx, NFT_MSG_DELCHAIN); + if (err < 0) + return err; + + ctx->table->use--; + list_del_rcu(&ctx->chain->list); + + return err; +} + +static inline bool +nft_rule_is_active(struct net *net, const struct nft_rule *rule) +{ + return (rule->genmask & (1 << net->nft.gencursor)) == 0; +} + +static inline int gencursor_next(struct net *net) +{ + return net->nft.gencursor+1 == 1 ? 1 : 0; +} + +static inline int +nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) +{ + return (rule->genmask & (1 << gencursor_next(net))) == 0; +} + +static inline void +nft_rule_activate_next(struct net *net, struct nft_rule *rule) +{ + /* Now inactive, will be active in the future */ + rule->genmask = (1 << net->nft.gencursor); +} + +static inline void +nft_rule_deactivate_next(struct net *net, struct nft_rule *rule) +{ + rule->genmask = (1 << gencursor_next(net)); +} + +static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) +{ + rule->genmask = 0; +} + +static int +nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) +{ + /* You cannot delete the same rule twice */ + if (nft_rule_is_active_next(ctx->net, rule)) { + nft_rule_deactivate_next(ctx->net, rule); + ctx->chain->use--; + return 0; + } + return -ENOENT; +} + +static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, + struct nft_rule *rule) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); + if (trans == NULL) + return NULL; + + nft_trans_rule(trans) = rule; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return trans; +} + +static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) +{ + struct nft_trans *trans; + int err; + + trans = nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule); + if (trans == NULL) + return -ENOMEM; + + err = nf_tables_delrule_deactivate(ctx, rule); + if (err < 0) { + nft_trans_destroy(trans); + return err; + } + + return 0; +} + +static int nft_delrule_by_chain(struct nft_ctx *ctx) +{ + struct nft_rule *rule; + int err; + + list_for_each_entry(rule, &ctx->chain->rules, list) { + err = nft_delrule(ctx, rule); + if (err < 0) + return err; + } + return 0; +} + +/* Internal set flag */ +#define NFT_SET_INACTIVE (1 << 15) + +static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, + struct nft_set *set) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { + nft_trans_set_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); + set->flags |= NFT_SET_INACTIVE; + } + nft_trans_set(trans) = set; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; +} + +static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) +{ + int err; + + err = nft_trans_set_add(ctx, NFT_MSG_DELSET, set); + if (err < 0) + return err; + + list_del_rcu(&set->list); + ctx->table->use--; + + return err; +} + /* * Tables */ @@ -318,9 +507,6 @@ done: return skb->len; } -/* Internal table flags */ -#define NFT_TABLE_INACTIVE (1 << 15) - static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -452,21 +638,6 @@ err: return ret; } -static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) -{ - struct nft_trans *trans; - - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); - if (trans == NULL) - return -ENOMEM; - - if (msg_type == NFT_MSG_NEWTABLE) - ctx->table->flags |= NFT_TABLE_INACTIVE; - - list_add_tail(&trans->list, &ctx->net->nft.commit_list); - return 0; -} - static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -544,7 +715,7 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, struct nft_af_info *afi; struct nft_table *table; struct net *net = sock_net(skb->sk); - int family = nfmsg->nfgen_family, err; + int family = nfmsg->nfgen_family; struct nft_ctx ctx; afi = nf_tables_afinfo_lookup(net, family, false); @@ -560,12 +731,8 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, return -EBUSY; nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); - err = nft_trans_table_add(&ctx, NFT_MSG_DELTABLE); - if (err < 0) - return err; - list_del_rcu(&table->list); - return 0; + return nft_deltable(&ctx); } static void nf_tables_table_destroy(struct nft_ctx *ctx) @@ -922,21 +1089,6 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain, rcu_assign_pointer(chain->stats, newstats); } -static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) -{ - struct nft_trans *trans; - - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); - if (trans == NULL) - return -ENOMEM; - - if (msg_type == NFT_MSG_NEWCHAIN) - ctx->chain->flags |= NFT_CHAIN_INACTIVE; - - list_add_tail(&trans->list, &ctx->net->nft.commit_list); - return 0; -} - static void nf_tables_chain_destroy(struct nft_chain *chain) { BUG_ON(chain->use > 0); @@ -1183,7 +1335,6 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; - int err; afi = nf_tables_afinfo_lookup(net, family, false); if (IS_ERR(afi)) @@ -1204,13 +1355,8 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, return -EBUSY; nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); - err = nft_trans_chain_add(&ctx, NFT_MSG_DELCHAIN); - if (err < 0) - return err; - table->use--; - list_del_rcu(&chain->list); - return 0; + return nft_delchain(&ctx); } /* @@ -1532,41 +1678,6 @@ err: return err; } -static inline bool -nft_rule_is_active(struct net *net, const struct nft_rule *rule) -{ - return (rule->genmask & (1 << net->nft.gencursor)) == 0; -} - -static inline int gencursor_next(struct net *net) -{ - return net->nft.gencursor+1 == 1 ? 1 : 0; -} - -static inline int -nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) -{ - return (rule->genmask & (1 << gencursor_next(net))) == 0; -} - -static inline void -nft_rule_activate_next(struct net *net, struct nft_rule *rule) -{ - /* Now inactive, will be active in the future */ - rule->genmask = (1 << net->nft.gencursor); -} - -static inline void -nft_rule_disactivate_next(struct net *net, struct nft_rule *rule) -{ - rule->genmask = (1 << gencursor_next(net)); -} - -static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) -{ - rule->genmask = 0; -} - static int nf_tables_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) { @@ -1692,21 +1803,6 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, - struct nft_rule *rule) -{ - struct nft_trans *trans; - - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); - if (trans == NULL) - return NULL; - - nft_trans_rule(trans) = rule; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); - - return trans; -} - #define NFT_RULE_MAXEXPRS 128 static struct nft_expr_info *info; @@ -1828,7 +1924,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, err = -ENOMEM; goto err2; } - nft_rule_disactivate_next(net, old_rule); + nft_rule_deactivate_next(net, old_rule); chain->use--; list_add_tail_rcu(&rule->list, &old_rule->list); } else { @@ -1872,49 +1968,6 @@ err1: return err; } -static int -nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) -{ - /* You cannot delete the same rule twice */ - if (nft_rule_is_active_next(ctx->net, rule)) { - nft_rule_disactivate_next(ctx->net, rule); - ctx->chain->use--; - return 0; - } - return -ENOENT; -} - -static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) -{ - struct nft_trans *trans; - int err; - - trans = nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule); - if (trans == NULL) - return -ENOMEM; - - err = nf_tables_delrule_deactivate(ctx, rule); - if (err < 0) { - nft_trans_destroy(trans); - return err; - } - - return 0; -} - -static int nft_delrule_by_chain(struct nft_ctx *ctx) -{ - struct nft_rule *rule; - int err; - - list_for_each_entry(rule, &ctx->chain->rules, list) { - err = nft_delrule(ctx, rule); - if (err < 0) - return err; - } - return 0; -} - static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2343,8 +2396,6 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb) return 0; } -#define NFT_SET_INACTIVE (1 << 15) /* Internal set flag */ - static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2419,26 +2470,6 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, return 0; } -static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, - struct nft_set *set) -{ - struct nft_trans *trans; - - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); - if (trans == NULL) - return -ENOMEM; - - if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { - nft_trans_set_id(trans) = - ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); - set->flags |= NFT_SET_INACTIVE; - } - nft_trans_set(trans) = set; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); - - return 0; -} - static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2632,13 +2663,7 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, if (!list_empty(&set->bindings)) return -EBUSY; - err = nft_trans_set_add(&ctx, NFT_MSG_DELSET, set); - if (err < 0) - return err; - - list_del_rcu(&set->list); - ctx.table->use--; - return 0; + return nft_delset(&ctx, set); } static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, -- cgit v1.2.3 From b9ac12ef099707f405d7478009564302d7ed8393 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Tue, 2 Sep 2014 16:42:26 +0200 Subject: netfilter: nf_tables: extend NFT_MSG_DELTABLE to support flushing the ruleset This patch extend the NFT_MSG_DELTABLE call to support flushing the entire ruleset. The options now are: * No family speficied, no table specified: flush all the ruleset. * Family specified, no table specified: flush all tables in the AF. * Family specified, table specified: flush the given table. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 72 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3ce5cfa34935..82374601577e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -707,6 +707,67 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, return 0; } +static int nft_flush_table(struct nft_ctx *ctx) +{ + int err; + struct nft_chain *chain, *nc; + struct nft_set *set, *ns; + + list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) { + ctx->chain = chain; + + err = nft_delrule_by_chain(ctx); + if (err < 0) + goto out; + + err = nft_delchain(ctx); + if (err < 0) + goto out; + } + + list_for_each_entry_safe(set, ns, &ctx->table->sets, list) { + if (set->flags & NFT_SET_ANONYMOUS && + !list_empty(&set->bindings)) + continue; + + err = nft_delset(ctx, set); + if (err < 0) + goto out; + } + + err = nft_deltable(ctx); +out: + return err; +} + +static int nft_flush(struct nft_ctx *ctx, int family) +{ + struct nft_af_info *afi; + struct nft_table *table, *nt; + const struct nlattr * const *nla = ctx->nla; + int err = 0; + + list_for_each_entry(afi, &ctx->net->nft.af_info, list) { + if (family != AF_UNSPEC && afi->family != family) + continue; + + ctx->afi = afi; + list_for_each_entry_safe(table, nt, &afi->tables, list) { + if (nla[NFTA_TABLE_NAME] && + nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) + continue; + + ctx->table = table; + + err = nft_flush_table(ctx); + if (err < 0) + goto out; + } + } +out: + return err; +} + static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -718,6 +779,10 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, int family = nfmsg->nfgen_family; struct nft_ctx ctx; + nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla); + if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL) + return nft_flush(&ctx, family); + afi = nf_tables_afinfo_lookup(net, family, false); if (IS_ERR(afi)) return PTR_ERR(afi); @@ -727,12 +792,11 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(table); if (table->flags & NFT_TABLE_INACTIVE) return -ENOENT; - if (table->use > 0) - return -EBUSY; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + ctx.afi = afi; + ctx.table = table; - return nft_deltable(&ctx); + return nft_flush_table(&ctx); } static void nf_tables_table_destroy(struct nft_ctx *ctx) -- cgit v1.2.3 From e42eff8a32f8b7bde88ea3c5a56391407cbe84f3 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Thu, 4 Sep 2014 14:06:14 +0200 Subject: netfilter: nft_nat: include a flag attribute Both SNAT and DNAT (and the upcoming masquerade) can have additional configuration parameters, such as port randomization and NAT addressing persistence. We can cover these scenarios by simply adding a flag attribute for userspace to fill when needed. The flags to use are defined in include/uapi/linux/netfilter/nf_nat.h: NF_NAT_RANGE_MAP_IPS NF_NAT_RANGE_PROTO_SPECIFIED NF_NAT_RANGE_PROTO_RANDOM NF_NAT_RANGE_PERSISTENT NF_NAT_RANGE_PROTO_RANDOM_FULLY NF_NAT_RANGE_PROTO_RANDOM_ALL The caller must take care of not messing up with the flags, as they are added unconditionally to the final resulting nf_nat_range. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_nat.h | 5 +++++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_nat.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index 1ad3659102b6..0880781ad7b6 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -13,6 +13,11 @@ #define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) +#define NF_NAT_RANGE_MASK \ + (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ + NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ + NF_NAT_RANGE_PROTO_RANDOM_FULLY) + struct nf_nat_ipv4_range { unsigned int flags; __be32 min_ip; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c000947d3f38..6022c6e6be18 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -785,6 +785,7 @@ enum nft_nat_types { * @NFTA_NAT_REG_ADDR_MAX: source register of address range end (NLA_U32: nft_registers) * @NFTA_NAT_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) * @NFTA_NAT_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers) + * @NFTA_NAT_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) */ enum nft_nat_attributes { NFTA_NAT_UNSPEC, @@ -794,6 +795,7 @@ enum nft_nat_attributes { NFTA_NAT_REG_ADDR_MAX, NFTA_NAT_REG_PROTO_MIN, NFTA_NAT_REG_PROTO_MAX, + NFTA_NAT_FLAGS, __NFTA_NAT_MAX }; #define NFTA_NAT_MAX (__NFTA_NAT_MAX - 1) diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 79ff58cd36dc..799550b476fb 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -33,6 +33,7 @@ struct nft_nat { enum nft_registers sreg_proto_max:8; enum nf_nat_manip_type type:8; u8 family; + u16 flags; }; static void nft_nat_eval(const struct nft_expr *expr, @@ -71,6 +72,8 @@ static void nft_nat_eval(const struct nft_expr *expr, range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } + range.flags |= priv->flags; + data[NFT_REG_VERDICT].verdict = nf_nat_setup_info(ct, &range, priv->type); } @@ -82,6 +85,7 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, + [NFTA_NAT_FLAGS] = { .type = NLA_U32 }, }; static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -149,6 +153,12 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else priv->sreg_proto_max = priv->sreg_proto_min; + if (tb[NFTA_NAT_FLAGS]) { + priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + } + return 0; } @@ -183,6 +193,12 @@ static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) htonl(priv->sreg_proto_max))) goto nla_put_failure; } + + if (priv->flags != 0) { + if (nla_put_be32(skb, NFTA_NAT_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + } + return 0; nla_put_failure: -- cgit v1.2.3 From c55fbbb4a730e3d6e1727b7de08b39e6cd847fad Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 8 Sep 2014 14:11:45 +0200 Subject: netfilter: ebtables: create audit records for replaces This is already done for x_tables (family AF_INET and AF_INET6), let's do it for AF_BRIDGE also. Signed-off-by: Nicolas Dichtel Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 6d69631b9f4d..d9a8c05d995d 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -26,6 +26,7 @@ #include #include #include +#include #include /* needed for logical [in,out]-dev filtering */ #include "../br_private.h" @@ -1058,6 +1059,20 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, vfree(table); vfree(counterstmp); + +#ifdef CONFIG_AUDIT + if (audit_enabled) { + struct audit_buffer *ab; + + ab = audit_log_start(current->audit_context, GFP_KERNEL, + AUDIT_NETFILTER_CFG); + if (ab) { + audit_log_format(ab, "table=%s family=%u entries=%u", + repl->name, AF_BRIDGE, repl->nentries); + audit_log_end(ab); + } + } +#endif return ret; free_unlock: -- cgit v1.2.3 From 8dd33cc93ec92b8460ed2ad98c6db39276f6a72b Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Thu, 4 Sep 2014 14:06:33 +0200 Subject: netfilter: nf_nat: generalize IPv4 masquerading support for nf_tables Let's refactor the code so we can reach the masquerade functionality from outside the xt context (ie. nftables). The patch includes the addition of an atomic counter to the masquerade notifier: the stuff to be done by the notifier is the same for xt and nftables. Therefore, only one notification handler is needed. This factorization only involves IPv4; a similar patch follows to handle IPv6. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_nat_masquerade.h | 14 +++ net/ipv4/netfilter/Kconfig | 7 ++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_MASQUERADE.c | 108 ++--------------- net/ipv4/netfilter/nf_nat_masquerade_ipv4.c | 153 +++++++++++++++++++++++++ 5 files changed, 184 insertions(+), 99 deletions(-) create mode 100644 include/net/netfilter/ipv4/nf_nat_masquerade.h create mode 100644 net/ipv4/netfilter/nf_nat_masquerade_ipv4.c (limited to 'net') diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h new file mode 100644 index 000000000000..a9c001c646da --- /dev/null +++ b/include/net/netfilter/ipv4/nf_nat_masquerade.h @@ -0,0 +1,14 @@ +#ifndef _NF_NAT_MASQUERADE_IPV4_H_ +#define _NF_NAT_MASQUERADE_IPV4_H_ + +#include + +unsigned int +nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, + const struct nf_nat_range *range, + const struct net_device *out); + +void nf_nat_masquerade_ipv4_register_notifier(void); +void nf_nat_masquerade_ipv4_unregister_notifier(void); + +#endif /*_NF_NAT_MASQUERADE_IPV4_H_ */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index fb173126f03d..4be3e541350e 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -184,8 +184,15 @@ config NF_NAT_IPV4 if NF_NAT_IPV4 +config NF_NAT_MASQUERADE_IPV4 + tristate "IPv4 masquerade support" + help + This is the kernel functionality to provide NAT in the masquerade + flavour (automatic source address selection). + config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" + select NF_NAT_MASQUERADE_IPV4 default m if NETFILTER_ADVANCED=n help Masquerading is a special case of NAT: all outgoing connections are diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 33001621465b..42056b2fd0e3 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o +obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 00352ce0f0de..da7f02a0b868 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -22,6 +22,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); @@ -46,103 +47,17 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) static unsigned int masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { - struct nf_conn *ct; - struct nf_conn_nat *nat; - enum ip_conntrack_info ctinfo; - struct nf_nat_range newrange; + struct nf_nat_range range; const struct nf_nat_ipv4_multi_range_compat *mr; - const struct rtable *rt; - __be32 newsrc, nh; - - NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); - - ct = nf_ct_get(skb, &ctinfo); - nat = nfct_nat(ct); - - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); - - /* Source address is 0.0.0.0 - locally generated packet that is - * probably not supposed to be masqueraded. - */ - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) - return NF_ACCEPT; mr = par->targinfo; - rt = skb_rtable(skb); - nh = rt_nexthop(rt, ip_hdr(skb)->daddr); - newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE); - if (!newsrc) { - pr_info("%s ate my IP address\n", par->out->name); - return NF_DROP; - } - - nat->masq_index = par->out->ifindex; - - /* Transfer from original range. */ - memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); - memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); - newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.ip = newsrc; - newrange.max_addr.ip = newsrc; - newrange.min_proto = mr->range[0].min; - newrange.max_proto = mr->range[0].max; + range.flags = mr->range[0].flags; + range.min_proto = mr->range[0].min; + range.max_proto = mr->range[0].max; - /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); + return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out); } -static int -device_cmp(struct nf_conn *i, void *ifindex) -{ - const struct nf_conn_nat *nat = nfct_nat(i); - - if (!nat) - return 0; - if (nf_ct_l3num(i) != NFPROTO_IPV4) - return 0; - return nat->masq_index == (int)(long)ifindex; -} - -static int masq_device_event(struct notifier_block *this, - unsigned long event, - void *ptr) -{ - const struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net *net = dev_net(dev); - - if (event == NETDEV_DOWN) { - /* Device was downed. Search entire table for - conntracks which were associated with that device, - and forget them. */ - NF_CT_ASSERT(dev->ifindex != 0); - - nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); - } - - return NOTIFY_DONE; -} - -static int masq_inet_event(struct notifier_block *this, - unsigned long event, - void *ptr) -{ - struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; - struct netdev_notifier_info info; - - netdev_notifier_info_init(&info, dev); - return masq_device_event(this, event, &info); -} - -static struct notifier_block masq_dev_notifier = { - .notifier_call = masq_device_event, -}; - -static struct notifier_block masq_inet_notifier = { - .notifier_call = masq_inet_event, -}; - static struct xt_target masquerade_tg_reg __read_mostly = { .name = "MASQUERADE", .family = NFPROTO_IPV4, @@ -160,12 +75,8 @@ static int __init masquerade_tg_init(void) ret = xt_register_target(&masquerade_tg_reg); - if (ret == 0) { - /* Register for device down reports */ - register_netdevice_notifier(&masq_dev_notifier); - /* Register IP address change reports */ - register_inetaddr_notifier(&masq_inet_notifier); - } + if (ret == 0) + nf_nat_masquerade_ipv4_register_notifier(); return ret; } @@ -173,8 +84,7 @@ static int __init masquerade_tg_init(void) static void __exit masquerade_tg_exit(void) { xt_unregister_target(&masquerade_tg_reg); - unregister_netdevice_notifier(&masq_dev_notifier); - unregister_inetaddr_notifier(&masq_inet_notifier); + nf_nat_masquerade_ipv4_unregister_notifier(); } module_init(masquerade_tg_init); diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c new file mode 100644 index 000000000000..c6eb42100e9a --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -0,0 +1,153 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int +nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, + const struct nf_nat_range *range, + const struct net_device *out) +{ + struct nf_conn *ct; + struct nf_conn_nat *nat; + enum ip_conntrack_info ctinfo; + struct nf_nat_range newrange; + const struct rtable *rt; + __be32 newsrc, nh; + + NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); + + ct = nf_ct_get(skb, &ctinfo); + nat = nfct_nat(ct); + + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + /* Source address is 0.0.0.0 - locally generated packet that is + * probably not supposed to be masqueraded. + */ + if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) + return NF_ACCEPT; + + rt = skb_rtable(skb); + nh = rt_nexthop(rt, ip_hdr(skb)->daddr); + newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); + if (!newsrc) { + pr_info("%s ate my IP address\n", out->name); + return NF_DROP; + } + + nat->masq_index = out->ifindex; + + /* Transfer from original range. */ + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.ip = newsrc; + newrange.max_addr.ip = newsrc; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + /* Hand modified range to generic setup. */ + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); + +static int device_cmp(struct nf_conn *i, void *ifindex) +{ + const struct nf_conn_nat *nat = nfct_nat(i); + + if (!nat) + return 0; + if (nf_ct_l3num(i) != NFPROTO_IPV4) + return 0; + return nat->masq_index == (int)(long)ifindex; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + + if (event == NETDEV_DOWN) { + /* Device was downed. Search entire table for + * conntracks which were associated with that device, + * and forget them. + */ + NF_CT_ASSERT(dev->ifindex != 0); + + nf_ct_iterate_cleanup(net, device_cmp, + (void *)(long)dev->ifindex, 0, 0); + } + + return NOTIFY_DONE; +} + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return masq_device_event(this, event, &info); +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); + +void nf_nat_masquerade_ipv4_register_notifier(void) +{ + /* check if the notifier was already set */ + if (atomic_inc_return(&masquerade_notifier_refcount) > 1) + return; + + /* Register for device down reports */ + register_netdevice_notifier(&masq_dev_notifier); + /* Register IP address change reports */ + register_inetaddr_notifier(&masq_inet_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier); + +void nf_nat_masquerade_ipv4_unregister_notifier(void) +{ + /* check if the notifier still has clients */ + if (atomic_dec_return(&masquerade_notifier_refcount) > 0) + return; + + unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell "); -- cgit v1.2.3 From be6b635cd674add9410efa9ac6f03e0040848b12 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Thu, 4 Sep 2014 14:06:49 +0200 Subject: netfilter: nf_nat: generalize IPv6 masquerading support for nf_tables Let's refactor the code so we can reach the masquerade functionality from outside the xt context (ie. nftables). The patch includes the addition of an atomic counter to the masquerade notifier: the stuff to be done by the notifier is the same for xt and nftables. Therefore, only one notification handler is needed. This factorization only involves IPv6; a similar patch exists to handle IPv4. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv6/nf_nat_masquerade.h | 10 +++ net/ipv6/netfilter/Kconfig | 7 ++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/ip6t_MASQUERADE.c | 76 ++-------------- net/ipv6/netfilter/nf_nat_masquerade_ipv6.c | 120 +++++++++++++++++++++++++ 5 files changed, 143 insertions(+), 71 deletions(-) create mode 100644 include/net/netfilter/ipv6/nf_nat_masquerade.h create mode 100644 net/ipv6/netfilter/nf_nat_masquerade_ipv6.c (limited to 'net') diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h new file mode 100644 index 000000000000..0a13396cd390 --- /dev/null +++ b/include/net/netfilter/ipv6/nf_nat_masquerade.h @@ -0,0 +1,10 @@ +#ifndef _NF_NAT_MASQUERADE_IPV6_H_ +#define _NF_NAT_MASQUERADE_IPV6_H_ + +unsigned int +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, + const struct net_device *out); +void nf_nat_masquerade_ipv6_register_notifier(void); +void nf_nat_masquerade_ipv6_unregister_notifier(void); + +#endif /* _NF_NAT_MASQUERADE_IPV6_H_ */ diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index ac93df16f5af..6c8cfec6836a 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -246,8 +246,15 @@ config NF_NAT_IPV6 if NF_NAT_IPV6 +config NF_NAT_MASQUERADE_IPV6 + tristate "IPv6 masquerade support" + help + This is the kernel functionality to provide NAT in the masquerade + flavour (automatic source address selection) for IPv6. + config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" + select NF_NAT_MASQUERADE_IPV6 help Masquerading is a special case of NAT: all outgoing connections are changed to seem to come from a particular interface's address, and diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index c0b263104ed2..89a0bd751f82 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o +obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o # defrag nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 3e4e92d5e157..7f9f45d829d2 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -19,33 +19,12 @@ #include #include #include +#include static unsigned int masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; - enum ip_conntrack_info ctinfo; - struct in6_addr src; - struct nf_conn *ct; - struct nf_nat_range newrange; - - ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); - - if (ipv6_dev_get_saddr(dev_net(par->out), par->out, - &ipv6_hdr(skb)->daddr, 0, &src) < 0) - return NF_DROP; - - nfct_nat(ct)->masq_index = par->out->ifindex; - - newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.in6 = src; - newrange.max_addr.in6 = src; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; - - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); + return nf_nat_masquerade_ipv6(skb, par->targinfo, par->out); } static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) @@ -57,48 +36,6 @@ static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) return 0; } -static int device_cmp(struct nf_conn *ct, void *ifindex) -{ - const struct nf_conn_nat *nat = nfct_nat(ct); - - if (!nat) - return 0; - if (nf_ct_l3num(ct) != NFPROTO_IPV6) - return 0; - return nat->masq_index == (int)(long)ifindex; -} - -static int masq_device_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - const struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net *net = dev_net(dev); - - if (event == NETDEV_DOWN) - nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); - - return NOTIFY_DONE; -} - -static struct notifier_block masq_dev_notifier = { - .notifier_call = masq_device_event, -}; - -static int masq_inet_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct inet6_ifaddr *ifa = ptr; - struct netdev_notifier_info info; - - netdev_notifier_info_init(&info, ifa->idev->dev); - return masq_device_event(this, event, &info); -} - -static struct notifier_block masq_inet_notifier = { - .notifier_call = masq_inet_event, -}; - static struct xt_target masquerade_tg6_reg __read_mostly = { .name = "MASQUERADE", .family = NFPROTO_IPV6, @@ -115,17 +52,14 @@ static int __init masquerade_tg6_init(void) int err; err = xt_register_target(&masquerade_tg6_reg); - if (err == 0) { - register_netdevice_notifier(&masq_dev_notifier); - register_inet6addr_notifier(&masq_inet_notifier); - } + if (err == 0) + nf_nat_masquerade_ipv6_register_notifier(); return err; } static void __exit masquerade_tg6_exit(void) { - unregister_inet6addr_notifier(&masq_inet_notifier); - unregister_netdevice_notifier(&masq_dev_notifier); + nf_nat_masquerade_ipv6_unregister_notifier(); xt_unregister_target(&masquerade_tg6_reg); } diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c new file mode 100644 index 000000000000..7745609665cd --- /dev/null +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2011 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6 + * NAT funded by Astaro. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, + const struct net_device *out) +{ + enum ip_conntrack_info ctinfo; + struct in6_addr src; + struct nf_conn *ct; + struct nf_nat_range newrange; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + if (ipv6_dev_get_saddr(dev_net(out), out, + &ipv6_hdr(skb)->daddr, 0, &src) < 0) + return NF_DROP; + + nfct_nat(ct)->masq_index = out->ifindex; + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.in6 = src; + newrange.max_addr.in6 = src; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); + +static int device_cmp(struct nf_conn *ct, void *ifindex) +{ + const struct nf_conn_nat *nat = nfct_nat(ct); + + if (!nat) + return 0; + if (nf_ct_l3num(ct) != NFPROTO_IPV6) + return 0; + return nat->masq_index == (int)(long)ifindex; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + + if (event == NETDEV_DOWN) + nf_ct_iterate_cleanup(net, device_cmp, + (void *)(long)dev->ifindex, 0, 0); + + return NOTIFY_DONE; +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = ptr; + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, ifa->idev->dev); + return masq_device_event(this, event, &info); +} + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); + +void nf_nat_masquerade_ipv6_register_notifier(void) +{ + /* check if the notifier is already set */ + if (atomic_inc_return(&masquerade_notifier_refcount) > 1) + return; + + register_netdevice_notifier(&masq_dev_notifier); + register_inet6addr_notifier(&masq_inet_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier); + +void nf_nat_masquerade_ipv6_unregister_notifier(void) +{ + /* check if the notifier still has clients */ + if (atomic_dec_return(&masquerade_notifier_refcount) > 0) + return; + + unregister_inet6addr_notifier(&masq_inet_notifier); + unregister_netdevice_notifier(&masq_dev_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); -- cgit v1.2.3 From 9ba1f726bec090399eb9bb9157eb32dedc8e8c45 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Mon, 8 Sep 2014 13:45:00 +0200 Subject: netfilter: nf_tables: add new nft_masq expression The nft_masq expression is intended to perform NAT in the masquerade flavour. We decided to have the masquerade functionality in a separated expression other than nft_nat. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_masq.h | 16 ++++++ include/uapi/linux/netfilter/nf_tables.h | 11 ++++ net/ipv4/netfilter/Kconfig | 6 +++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nft_masq_ipv4.c | 89 ++++++++++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 6 +++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/nft_masq_ipv6.c | 89 ++++++++++++++++++++++++++++++++ net/netfilter/Kconfig | 9 ++++ net/netfilter/Makefile | 1 + net/netfilter/nft_masq.c | 59 +++++++++++++++++++++ 11 files changed, 288 insertions(+) create mode 100644 include/net/netfilter/nft_masq.h create mode 100644 net/ipv4/netfilter/nft_masq_ipv4.c create mode 100644 net/ipv6/netfilter/nft_masq_ipv6.c create mode 100644 net/netfilter/nft_masq.c (limited to 'net') diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h new file mode 100644 index 000000000000..c72729f954f4 --- /dev/null +++ b/include/net/netfilter/nft_masq.h @@ -0,0 +1,16 @@ +#ifndef _NFT_MASQ_H_ +#define _NFT_MASQ_H_ + +struct nft_masq { + u32 flags; +}; + +extern const struct nla_policy nft_masq_policy[]; + +int nft_masq_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]); + +int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr); + +#endif /* _NFT_MASQ_H_ */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 6022c6e6be18..eeec0ae845ef 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -800,4 +800,15 @@ enum nft_nat_attributes { }; #define NFTA_NAT_MAX (__NFTA_NAT_MAX - 1) +/** + * enum nft_masq_attributes - nf_tables masquerade expression attributes + * + * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) + */ +enum nft_masq_attributes { + NFTA_MASQ_FLAGS, + __NFTA_MASQ_MAX +}; +#define NFTA_MASQ_MAX (__NFTA_MASQ_MAX - 1) + #endif /* _LINUX_NF_TABLES_H */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4be3e541350e..8dd3d9f19d82 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -190,6 +190,12 @@ config NF_NAT_MASQUERADE_IPV4 This is the kernel functionality to provide NAT in the masquerade flavour (automatic source address selection). +config NFT_MASQ_IPV4 + tristate "IPv4 masquerading support for nf_tables" + depends on NF_TABLES_IPV4 + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV4 + config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NF_NAT_MASQUERADE_IPV4 diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 42056b2fd0e3..7d019aefb0ed 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o +obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c new file mode 100644 index 000000000000..6ea1d207b6a5 --- /dev/null +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_masq_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_masq *priv = nft_expr_priv(expr); + struct nf_nat_range range; + unsigned int verdict; + + range.flags = priv->flags; + + verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, + &range, pkt->out); + + data[NFT_REG_VERDICT].verdict = verdict; +} + +static int nft_masq_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + int err; + + err = nft_masq_init(ctx, expr, tb); + if (err < 0) + return err; + + nf_nat_masquerade_ipv4_register_notifier(); + return 0; +} + +static void nft_masq_ipv4_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nf_nat_masquerade_ipv4_unregister_notifier(); +} + +static struct nft_expr_type nft_masq_ipv4_type; +static const struct nft_expr_ops nft_masq_ipv4_ops = { + .type = &nft_masq_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), + .eval = nft_masq_ipv4_eval, + .init = nft_masq_ipv4_init, + .destroy = nft_masq_ipv4_destroy, + .dump = nft_masq_dump, +}; + +static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "masq", + .ops = &nft_masq_ipv4_ops, + .policy = nft_masq_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_masq_ipv4_module_init(void) +{ + return nft_register_expr(&nft_masq_ipv4_type); +} + +static void __exit nft_masq_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_masq_ipv4_type); +} + +module_init(nft_masq_ipv4_module_init); +module_exit(nft_masq_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6c8cfec6836a..24c535f66df0 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -252,6 +252,12 @@ config NF_NAT_MASQUERADE_IPV6 This is the kernel functionality to provide NAT in the masquerade flavour (automatic source address selection) for IPv6. +config NFT_MASQ_IPV6 + tristate "IPv6 masquerade support for nf_tables" + depends on NF_TABLES_IPV6 + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV6 + config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NF_NAT_MASQUERADE_IPV6 diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 89a0bd751f82..482c4dff273f 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o +obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c new file mode 100644 index 000000000000..4e51334ef6b7 --- /dev/null +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_masq_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_masq *priv = nft_expr_priv(expr); + struct nf_nat_range range; + unsigned int verdict; + + range.flags = priv->flags; + + verdict = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); + + data[NFT_REG_VERDICT].verdict = verdict; +} + +static int nft_masq_ipv6_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + int err; + + err = nft_masq_init(ctx, expr, tb); + if (err < 0) + return err; + + nf_nat_masquerade_ipv6_register_notifier(); + return 0; +} + +static void nft_masq_ipv6_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nf_nat_masquerade_ipv6_unregister_notifier(); +} + +static struct nft_expr_type nft_masq_ipv6_type; +static const struct nft_expr_ops nft_masq_ipv6_ops = { + .type = &nft_masq_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), + .eval = nft_masq_ipv6_eval, + .init = nft_masq_ipv6_init, + .destroy = nft_masq_ipv6_destroy, + .dump = nft_masq_dump, +}; + +static struct nft_expr_type nft_masq_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "masq", + .ops = &nft_masq_ipv6_ops, + .policy = nft_masq_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_masq_ipv6_module_init(void) +{ + return nft_register_expr(&nft_masq_ipv6_type); +} + +static void __exit nft_masq_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_masq_ipv6_type); +} + +module_init(nft_masq_ipv6_module_init); +module_exit(nft_masq_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ad751fe2e82b..37428723394f 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -496,6 +496,15 @@ config NFT_LIMIT This option adds the "limit" expression that you can use to ratelimit rule matchings. +config NFT_MASQ + depends on NF_TABLES + depends on NF_CONNTRACK + depends on NF_NAT + tristate "Netfilter nf_tables masquerade support" + help + This option adds the "masquerade" expression that you can use + to perform NAT in the masquerade flavour. + config NFT_NAT depends on NF_TABLES depends on NF_CONNTRACK diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 8308624a406a..0637792f6faf 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o +obj-$(CONFIG_NFT_MASQ) += nft_masq.o # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c new file mode 100644 index 000000000000..6637bab00567 --- /dev/null +++ b/net/netfilter/nft_masq.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { + [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, +}; +EXPORT_SYMBOL_GPL(nft_masq_policy); + +int nft_masq_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_masq *priv = nft_expr_priv(expr); + + if (tb[NFTA_MASQ_FLAGS] == NULL) + return 0; + + priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_masq_init); + +int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_masq *priv = nft_expr_priv(expr); + + if (priv->flags == 0) + return 0; + + if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_masq_dump); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); -- cgit v1.2.3 From 49a601589caaf0e93194c0cc9b4ecddbe75dd2d5 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Fri, 5 Sep 2014 15:09:03 +0200 Subject: net/ipv4: bind ip_nonlocal_bind to current netns net.ipv4.ip_nonlocal_bind sysctl was global to all network namespaces. This patch allows to set a different value for each network namespace. Signed-off-by: Vincent Bernat Signed-off-by: David S. Miller --- include/net/ip.h | 2 -- include/net/netns/ipv4.h | 1 + net/ipv4/af_inet.c | 6 +----- net/ipv4/ping.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv6/af_inet6.c | 2 +- net/sctp/protocol.c | 2 +- 7 files changed, 12 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index c8fd6112bd0b..14bfc8e1bcf9 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -229,8 +229,6 @@ static inline int inet_is_local_reserved_port(struct net *net, int port) } #endif -extern int sysctl_ip_nonlocal_bind; - /* From inetpeer.c */ extern int inet_peer_threshold; extern int inet_peer_minttl; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index aec5e12f9f19..24945cefc4fd 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -76,6 +76,7 @@ struct netns_ipv4 { int sysctl_tcp_ecn; int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; + int sysctl_ip_nonlocal_bind; int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d156b3c5f363..b537bd94906c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -418,10 +418,6 @@ int inet_release(struct socket *sock) } EXPORT_SYMBOL(inet_release); -/* It is off by default, see below. */ -int sysctl_ip_nonlocal_bind __read_mostly; -EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); - int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; @@ -461,7 +457,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * is temporarily down) */ err = -EADDRNOTAVAIL; - if (!sysctl_ip_nonlocal_bind && + if (!net->ipv4.sysctl_ip_nonlocal_bind && !(inet->freebind || inet->transparent) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index a3c59a077a5f..57f7c9804139 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -311,7 +311,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; - if ((sysctl_ip_nonlocal_bind == 0 && + if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && isk->freebind == 0 && isk->transparent == 0 && chk_addr_ret != RTN_LOCAL) || chk_addr_ret == RTN_MULTICAST || diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 45d156dacd61..1599966f4639 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -285,13 +285,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &ip_ttl_min, .extra2 = &ip_ttl_max, }, - { - .procname = "ip_nonlocal_bind", - .data = &sysctl_ip_nonlocal_bind, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_syn_retries", .data = &sysctl_tcp_syn_retries, @@ -848,6 +841,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ip_nonlocal_bind", + .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "fwmark_reflect", .data = &init_net.ipv4.sysctl_fwmark_reflect, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b9393e6a21fe..e4865a3ebe1d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -302,7 +302,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type(net, v4addr); - if (!sysctl_ip_nonlocal_bind && + if (!net->ipv4.sysctl_ip_nonlocal_bind && !(inet->freebind || inet->transparent) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6240834f4b95..9d2c6c9facb6 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -366,7 +366,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && !sp->inet.freebind && - !sysctl_ip_nonlocal_bind) + !net->ipv4.sysctl_ip_nonlocal_bind) return 0; if (ipv6_only_sock(sctp_opt2sk(sp))) -- cgit v1.2.3 From 0f49579a39533bf839b2af807b094fc652f9e49b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 5 Sep 2014 15:51:28 +0200 Subject: bridge: switch order of rx_handler reg and upper dev link The thing is that netdev_master_upper_dev_link calls call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev). That generates rtnl link message and during that, rtnl_link_ops->fill_slave_info is called. But with current ordering, rx_handler and IFF_BRIDGE_PORT are not set yet so there would have to be check for that in fill_slave_info callback. Resolve this by reordering to similar what bonding and team does to avoid the check. Also add removal of IFF_BRIDGE_PORT flag into error path. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_if.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 078d336a1f37..a9f54a9b6690 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -252,12 +252,12 @@ static void del_nbp(struct net_bridge_port *p) br_fdb_delete_by_port(br, p, 1); nbp_update_port_count(br); + netdev_upper_dev_unlink(dev, br->dev); + dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); - netdev_upper_dev_unlink(dev, br->dev); - br_multicast_del_port(p); kobject_uevent(&p->kobj, KOBJ_REMOVE); @@ -476,16 +476,16 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err3; - err = netdev_master_upper_dev_link(dev, br->dev); + err = netdev_rx_handler_register(dev, br_handle_frame, p); if (err) goto err4; - err = netdev_rx_handler_register(dev, br_handle_frame, p); + dev->priv_flags |= IFF_BRIDGE_PORT; + + err = netdev_master_upper_dev_link(dev, br->dev); if (err) goto err5; - dev->priv_flags |= IFF_BRIDGE_PORT; - dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); @@ -520,7 +520,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return 0; err5: - netdev_upper_dev_unlink(dev, br->dev); + dev->priv_flags &= ~IFF_BRIDGE_PORT; + netdev_rx_handler_unregister(dev); err4: br_netpoll_disable(p); err3: -- cgit v1.2.3 From ced8283f90b88bbf7ec9a6b869586b611167394e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 5 Sep 2014 15:51:29 +0200 Subject: bridge: implement rtnl_link_ops->get_slave_size and rtnl_link_ops->fill_slave_info Allow rtnetlink users to get port info in IFLA_INFO_SLAVE_DATA attr Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index cb5fcf62f663..80d23471743f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -461,6 +461,19 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev, return register_netdevice(dev); } +static int br_port_fill_slave_info(struct sk_buff *skb, + const struct net_device *brdev, + const struct net_device *dev) +{ + return br_port_fill_attrs(skb, br_port_get_rtnl(dev)); +} + +static size_t br_port_get_slave_size(const struct net_device *brdev, + const struct net_device *dev) +{ + return br_port_info_size(); +} + static size_t br_get_link_af_size(const struct net_device *dev) { struct net_port_vlans *pv; @@ -485,12 +498,14 @@ static struct rtnl_af_ops br_af_ops = { }; struct rtnl_link_ops br_link_ops __read_mostly = { - .kind = "bridge", - .priv_size = sizeof(struct net_bridge), - .setup = br_dev_setup, - .validate = br_validate, - .newlink = br_dev_newlink, - .dellink = br_dev_delete, + .kind = "bridge", + .priv_size = sizeof(struct net_bridge), + .setup = br_dev_setup, + .validate = br_validate, + .newlink = br_dev_newlink, + .dellink = br_dev_delete, + .get_slave_size = br_port_get_slave_size, + .fill_slave_info = br_port_fill_slave_info, }; int __init br_netlink_init(void) -- cgit v1.2.3 From 3ac636b8591c37bb5028814a4ebd41d263b56181 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 5 Sep 2014 15:51:30 +0200 Subject: bridge: implement rtnl_link_ops->slave_changelink Allow rtnetlink users to set port info via IFLA_INFO_SLAVE_DATA attr Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 80d23471743f..05aeea1222a7 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -276,7 +276,7 @@ static int br_afspec(struct net_bridge *br, return err; } -static const struct nla_policy ifla_brport_policy[IFLA_BRPORT_MAX + 1] = { +static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, [IFLA_BRPORT_COST] = { .type = NLA_U32 }, [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 }, @@ -382,7 +382,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh) if (p && protinfo) { if (protinfo->nla_type & NLA_F_NESTED) { err = nla_parse_nested(tb, IFLA_BRPORT_MAX, - protinfo, ifla_brport_policy); + protinfo, br_port_policy); if (err) return err; @@ -461,6 +461,16 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev, return register_netdevice(dev); } +static int br_port_slave_changelink(struct net_device *brdev, + struct net_device *dev, + struct nlattr *tb[], + struct nlattr *data[]) +{ + if (!data) + return 0; + return br_setport(br_port_get_rtnl(dev), data); +} + static int br_port_fill_slave_info(struct sk_buff *skb, const struct net_device *brdev, const struct net_device *dev) @@ -504,6 +514,10 @@ struct rtnl_link_ops br_link_ops __read_mostly = { .validate = br_validate, .newlink = br_dev_newlink, .dellink = br_dev_delete, + + .slave_maxtype = IFLA_BRPORT_MAX, + .slave_policy = br_port_policy, + .slave_changelink = br_port_slave_changelink, .get_slave_size = br_port_get_slave_size, .fill_slave_info = br_port_fill_slave_info, }; -- cgit v1.2.3 From e5c3ea5c668033b303e7ac835d7d91da32d97958 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 5 Sep 2014 15:51:31 +0200 Subject: bridge: implement rtnl_link_ops->get_size and rtnl_link_ops->fill_info Allow rtnetlink users to get bridge master info in IFLA_INFO_DATA attr This initial part implements forward_delay, hello_time, max_age options. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 12 ++++++++++++ net/bridge/br_netlink.c | 25 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ff957604a721..c80f95f6ee78 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -215,6 +215,18 @@ enum in6_addr_gen_mode { IN6_ADDR_GEN_MODE_NONE, }; +/* Bridge section */ + +enum { + IFLA_BR_UNSPEC, + IFLA_BR_FORWARD_DELAY, + IFLA_BR_HELLO_TIME, + IFLA_BR_MAX_AGE, + __IFLA_BR_MAX, +}; + +#define IFLA_BR_MAX (__IFLA_BR_MAX - 1) + enum { BRIDGE_MODE_UNSPEC, BRIDGE_MODE_HAIRPIN, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 05aeea1222a7..bcac09fe2ac8 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -484,6 +484,29 @@ static size_t br_port_get_slave_size(const struct net_device *brdev, return br_port_info_size(); } +static size_t br_get_size(const struct net_device *brdev) +{ + return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */ + 0; +} + +static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) +{ + struct net_bridge *br = netdev_priv(brdev); + u32 forward_delay = jiffies_to_clock_t(br->forward_delay); + u32 hello_time = jiffies_to_clock_t(br->hello_time); + u32 age_time = jiffies_to_clock_t(br->max_age); + + if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || + nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || + nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time)) + return -EMSGSIZE; + + return 0; +} + static size_t br_get_link_af_size(const struct net_device *dev) { struct net_port_vlans *pv; @@ -514,6 +537,8 @@ struct rtnl_link_ops br_link_ops __read_mostly = { .validate = br_validate, .newlink = br_dev_newlink, .dellink = br_dev_delete, + .get_size = br_get_size, + .fill_info = br_fill_info, .slave_maxtype = IFLA_BRPORT_MAX, .slave_policy = br_port_policy, -- cgit v1.2.3 From 13323516172178ff8184855ee4bc66d46fd89619 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 5 Sep 2014 15:51:32 +0200 Subject: bridge: implement rtnl_link_ops->changelink Allow rtnetlink users to set bridge master info via IFLA_INFO_DATA attr This initial part implements forward_delay, hello_time, max_age options. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index bcac09fe2ac8..7c97a261afae 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -484,6 +484,42 @@ static size_t br_port_get_slave_size(const struct net_device *brdev, return br_port_info_size(); } +static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { + [IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 }, + [IFLA_BR_HELLO_TIME] = { .type = NLA_U32 }, + [IFLA_BR_MAX_AGE] = { .type = NLA_U32 }, +}; + +static int br_changelink(struct net_device *brdev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct net_bridge *br = netdev_priv(brdev); + int err; + + if (!data) + return 0; + + if (data[IFLA_BR_FORWARD_DELAY]) { + err = br_set_forward_delay(br, nla_get_u32(data[IFLA_BR_FORWARD_DELAY])); + if (err) + return err; + } + + if (data[IFLA_BR_HELLO_TIME]) { + err = br_set_hello_time(br, nla_get_u32(data[IFLA_BR_HELLO_TIME])); + if (err) + return err; + } + + if (data[IFLA_BR_MAX_AGE]) { + err = br_set_max_age(br, nla_get_u32(data[IFLA_BR_MAX_AGE])); + if (err) + return err; + } + + return 0; +} + static size_t br_get_size(const struct net_device *brdev) { return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ @@ -534,8 +570,11 @@ struct rtnl_link_ops br_link_ops __read_mostly = { .kind = "bridge", .priv_size = sizeof(struct net_bridge), .setup = br_dev_setup, + .maxtype = IFLA_BRPORT_MAX, + .policy = br_policy, .validate = br_validate, .newlink = br_dev_newlink, + .changelink = br_changelink, .dellink = br_dev_delete, .get_size = br_get_size, .fill_info = br_fill_info, -- cgit v1.2.3 From 5aaa62d608464bedb30afc62e5073629de505afb Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Sat, 6 Sep 2014 13:08:08 +0300 Subject: bridge: Cleanup of unncessary check. This patch removes an unncessary check in the br_afspec() method of br_netlink.c. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 7c97a261afae..90a91e137acc 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -257,9 +257,6 @@ static int br_afspec(struct net_bridge *br, } else err = br_vlan_add(br, vinfo->vid, vinfo->flags); - if (err) - break; - break; case RTM_DELLINK: -- cgit v1.2.3 From e403aded79a1bfb610adc53490ded8d2058f9daf Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sat, 6 Sep 2014 19:06:11 +0800 Subject: openvswitch: change the data type of error status to atomic_long_t Change the date type of error status from u64 to atomic_long_t, and use atomic operation, then remove the lock which is used to protect the error status. The operation of atomic maybe faster than spin lock. Cc: Pravin Shelar Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/openvswitch/vport.c | 25 ++++++++----------------- net/openvswitch/vport.h | 10 ++++------ 2 files changed, 12 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 6d8f2ec481d9..f7e63f9df7b9 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -148,8 +148,6 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, return ERR_PTR(-ENOMEM); } - spin_lock_init(&vport->stats_lock); - return vport; } @@ -268,14 +266,10 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) * netdev-stats can be directly read over netlink-ioctl. */ - spin_lock_bh(&vport->stats_lock); - - stats->rx_errors = vport->err_stats.rx_errors; - stats->tx_errors = vport->err_stats.tx_errors; - stats->tx_dropped = vport->err_stats.tx_dropped; - stats->rx_dropped = vport->err_stats.rx_dropped; - - spin_unlock_bh(&vport->stats_lock); + stats->rx_errors = atomic_long_read(&vport->err_stats.rx_errors); + stats->tx_errors = atomic_long_read(&vport->err_stats.tx_errors); + stats->tx_dropped = atomic_long_read(&vport->err_stats.tx_dropped); + stats->rx_dropped = atomic_long_read(&vport->err_stats.rx_dropped); for_each_possible_cpu(i) { const struct pcpu_sw_netstats *percpu_stats; @@ -495,27 +489,24 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb) static void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type) { - spin_lock(&vport->stats_lock); - switch (err_type) { case VPORT_E_RX_DROPPED: - vport->err_stats.rx_dropped++; + atomic_long_inc(&vport->err_stats.rx_dropped); break; case VPORT_E_RX_ERROR: - vport->err_stats.rx_errors++; + atomic_long_inc(&vport->err_stats.rx_errors); break; case VPORT_E_TX_DROPPED: - vport->err_stats.tx_dropped++; + atomic_long_inc(&vport->err_stats.tx_dropped); break; case VPORT_E_TX_ERROR: - vport->err_stats.tx_errors++; + atomic_long_inc(&vport->err_stats.tx_errors); break; } - spin_unlock(&vport->stats_lock); } static void free_vport_rcu(struct rcu_head *rcu) diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 35f89d84b45e..0d95b9f5f9c4 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -62,10 +62,10 @@ int ovs_vport_send(struct vport *, struct sk_buff *); /* The following definitions are for implementers of vport devices: */ struct vport_err_stats { - u64 rx_dropped; - u64 rx_errors; - u64 tx_dropped; - u64 tx_errors; + atomic_long_t rx_dropped; + atomic_long_t rx_errors; + atomic_long_t tx_dropped; + atomic_long_t tx_errors; }; /** * struct vport_portids - array of netlink portids of a vport. @@ -93,7 +93,6 @@ struct vport_portids { * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. * @percpu_stats: Points to per-CPU statistics used and maintained by vport - * @stats_lock: Protects @err_stats; * @err_stats: Points to error statistics used and maintained by vport */ struct vport { @@ -108,7 +107,6 @@ struct vport { struct pcpu_sw_netstats __percpu *percpu_stats; - spinlock_t stats_lock; struct vport_err_stats err_stats; }; -- cgit v1.2.3 From 196332f5a1e5c3ec7171742fce83d03795abf120 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 9 Sep 2014 16:21:46 -0700 Subject: Bluetooth: Fix allowing SMP Signing info PDU If the remote side is not distributing its IRK but is distributing the CSRK the next PDU after master identification is the Signing Information. This patch fixes a missing SMP_ALLOW_CMD() for this in the smp_cmd_master_ident() function. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 25c9040e0b12..dc575aba2e65 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1324,6 +1324,8 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) SMP_DISALLOW_CMD(smp, SMP_CMD_MASTER_IDENT); if (smp->remote_key_dist & SMP_DIST_ID_KEY) SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); + else if (smp->remote_key_dist & SMP_DIST_SIGN) + SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); skb_pull(skb, sizeof(*rp)); -- cgit v1.2.3 From ca777eff51f7fbaebd954e645d8ecb781a906b4a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Sep 2014 08:06:07 -0700 Subject: tcp: remove dst refcount false sharing for prequeue mode Alexander Duyck reported high false sharing on dst refcount in tcp stack when prequeue is used. prequeue is the mechanism used when a thread is blocked in recvmsg()/read() on a TCP socket, using a blocking model rather than select()/poll()/epoll() non blocking one. We already try to use RCU in input path as much as possible, but we were forced to take a refcount on the dst when skb escaped RCU protected region. When/if the user thread runs on different cpu, dst_release() will then touch dst refcount again. Commit 093162553c33 (tcp: force a dst refcount when prequeue packet) was an example of a race fix. It turns out the only remaining usage of skb->dst for a packet stored in a TCP socket prequeue is IP early demux. We can add a logic to detect when IP early demux is probably going to use skb->dst. Because we do an optimistic check rather than duplicate existing logic, we need to guard inet_sk_rx_dst_set() and inet6_sk_rx_dst_set() from using a NULL dst. Many thanks to Alexander for providing a nice bug report, git bisection, and reproducer. Tested using Alexander script on a 40Gb NIC, 8 RX queues. Hosts have 24 cores, 48 hyper threads. echo 0 >/proc/sys/net/ipv4/tcp_autocorking for i in `seq 0 47` do for j in `seq 0 2` do netperf -H $DEST -t TCP_STREAM -l 1000 \ -c -C -T $i,$i -P 0 -- \ -m 64 -s 64K -D & done done Before patch : ~6Mpps and ~95% cpu usage on receiver After patch : ~9Mpps and ~35% cpu usage on receiver. Signed-off-by: Eric Dumazet Reported-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 20 ++++++++++++++++---- net/ipv6/tcp_ipv6.c | 15 +++++++++------ 2 files changed, 25 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3f9bc3f0bba0..7881b96d2b72 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1559,7 +1559,17 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) skb_queue_len(&tp->ucopy.prequeue) == 0) return false; - skb_dst_force(skb); + /* Before escaping RCU protected region, we need to take care of skb + * dst. Prequeue is only enabled for established sockets. + * For such sockets, we might need the skb dst only to set sk->sk_rx_dst + * Instead of doing full sk_rx_dst validity here, let's perform + * an optimistic check. + */ + if (likely(sk->sk_rx_dst)) + skb_dst_drop(skb); + else + skb_dst_force(skb); + __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; if (tp->ucopy.memory > sk->sk_rcvbuf) { @@ -1765,9 +1775,11 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - dst_hold(dst); - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (dst) { + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + } } EXPORT_SYMBOL(inet_sk_rx_dst_set); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5b3c70ff7a72..1835480336ac 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -93,13 +93,16 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - const struct rt6_info *rt = (const struct rt6_info *)dst; - dst_hold(dst); - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - if (rt->rt6i_node) - inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; + if (dst) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (rt->rt6i_node) + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; + } } static void tcp_v6_hash(struct sock *sk) -- cgit v1.2.3 From 286aad3c4014ca825c447e07e24f8929e6d266d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 Sep 2014 08:04:49 +0200 Subject: net: bpf: be friendly to kmemcheck Reported by Mikulas Patocka, kmemcheck currently barks out a false positive since we don't have special kmemcheck annotation for bitfields used in bpf_prog structure. We currently have jited:1, len:31 and thus when accessing len while CONFIG_KMEMCHECK enabled, kmemcheck throws a warning that we're reading uninitialized memory. As we don't need the whole bit universe for pages member, we can just split it to u16 and use a bool flag for jited instead of a bitfield. Signed-off-by: Mikulas Patocka Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/arm/net/bpf_jit_32.c | 2 +- arch/mips/net/bpf_jit.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- include/linux/filter.h | 6 +++--- net/core/filter.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 2d1a5b93d91c..6b45f649eff0 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -933,7 +933,7 @@ void bpf_jit_compile(struct bpf_prog *fp) set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *)ctx.target; - fp->jited = 1; + fp->jited = true; out: kfree(ctx.offsets); return; diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index cfa83cf2447d..0e97ccd29fe3 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1417,7 +1417,7 @@ void bpf_jit_compile(struct bpf_prog *fp) bpf_jit_dump(fp->len, alloc_size, 2, ctx.target); fp->bpf_func = (void *)ctx.target; - fp->jited = 1; + fp->jited = true; out: kfree(ctx.offsets); diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 40c53ff59124..cbae2dfd053c 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -686,7 +686,7 @@ void bpf_jit_compile(struct bpf_prog *fp) ((u64 *)image)[0] = (u64)code_base; ((u64 *)image)[1] = local_paca->kernel_toc; fp->bpf_func = (void *)image; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index b734f975c22e..555f5c7e83ab 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -842,7 +842,7 @@ void bpf_jit_compile(struct bpf_prog *fp) if (jit.start) { set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *) jit.start; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index f7a736b645e8..b2ad9dc5425e 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -801,7 +801,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; if (image) { bpf_flush_icache(image, image + proglen); fp->bpf_func = (void *)image; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9de0b5476b0c..d56cd1f515bd 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -955,7 +955,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) bpf_flush_icache(header, image + proglen); set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)image; - prog->jited = 1; + prog->jited = true; } out: kfree(addrs); diff --git a/include/linux/filter.h b/include/linux/filter.h index 868764fcffb8..4b59edead908 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -300,9 +300,9 @@ struct bpf_work_struct { }; struct bpf_prog { - u32 pages; /* Number of allocated pages */ - u32 jited:1, /* Is our filter JIT'ed? */ - len:31; /* Number of filter blocks */ + u16 pages; /* Number of allocated pages */ + bool jited; /* Is our filter JIT'ed? */ + u32 len; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ struct bpf_work_struct *work; /* Deferred free work struct */ unsigned int (*bpf_func)(const struct sk_buff *skb, diff --git a/net/core/filter.c b/net/core/filter.c index fa5b7d0f77ac..dfc716ffa44b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -972,7 +972,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) int err; fp->bpf_func = NULL; - fp->jited = 0; + fp->jited = false; err = bpf_check_classic(fp->insns, fp->len); if (err) { -- cgit v1.2.3 From 17448e5f63c8f36d00532327ae65e253d1395b08 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 8 Sep 2014 23:33:01 +0200 Subject: net_sched: sfq: remove unused macro not used anymore since ddecf0f (net_sched: sfq: add optional RED on top of SFQ). Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_sfq.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 1af2f73906d0..211db9017c35 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -310,11 +310,6 @@ static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb) slot->skblist_prev = skb; } -#define slot_queue_walk(slot, skb) \ - for (skb = slot->skblist_next; \ - skb != (struct sk_buff *)slot; \ - skb = skb->next) - static unsigned int sfq_drop(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); -- cgit v1.2.3 From 67cc0d4077951295f42bed63805e91b46c24477b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 8 Sep 2014 19:58:58 -0400 Subject: net-timestamp: optimize sock_tx_timestamp default path Few packets have timestamping enabled. Exit sock_tx_timestamp quickly in this common case. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 10 +++++++++- net/socket.c | 7 ++----- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 049ab1b732a6..515a4d01e932 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2199,6 +2199,8 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, sk->sk_stamp = skb->tstamp; } +void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags); + /** * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet @@ -2206,7 +2208,13 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, * * Note : callers should take care of initial *tx_flags value (usually 0) */ -void sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags); +static inline void sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) +{ + if (unlikely(sk->sk_tsflags)) + __sock_tx_timestamp(sk, tx_flags); + if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) + *tx_flags |= SKBTX_WIFI_STATUS; +} /** * sk_eat_skb - Release a skb if it is no longer needed diff --git a/net/socket.c b/net/socket.c index 2e2586e2dee1..d40f522541aa 100644 --- a/net/socket.c +++ b/net/socket.c @@ -610,7 +610,7 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -void sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) +void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) { u8 flags = *tx_flags; @@ -626,12 +626,9 @@ void sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK) flags |= SKBTX_ACK_TSTAMP; - if (sock_flag(sk, SOCK_WIFI_STATUS)) - flags |= SKBTX_WIFI_STATUS; - *tx_flags = flags; } -EXPORT_SYMBOL(sock_tx_timestamp); +EXPORT_SYMBOL(__sock_tx_timestamp); static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) -- cgit v1.2.3 From cbeddd5d163ba7a6cd96a96509f7043cce4f68d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Sep 2014 13:07:32 +0200 Subject: ipv6: mcast: remove dead debugging defines It's not used anywhere, so just remove these. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 6833dd07b2c2..484a94215d88 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -64,15 +64,6 @@ #include -/* Set to 3 to get tracing... */ -#define MCAST_DEBUG 2 - -#if MCAST_DEBUG >= 3 -#define MDBG(x) printk x -#else -#define MDBG(x) -#endif - /* Ensure that we have struct in6_addr aligned on 32bit word. */ static void *__mld2_query_bugs[] __attribute__((__unused__)) = { BUILD_BUG_ON_NULL(offsetof(struct mld2_query, mld2q_srcs) % 4), -- cgit v1.2.3 From 8e380f004e8e3c071590426a64210331051a4c42 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Sep 2014 08:11:41 -0700 Subject: ipv4: rcu cleanup in ip_ra_control() Remove one sparse warning : net/ipv4/ip_sockglue.c:328:22: warning: incorrect type in assignment (different address spaces) net/ipv4/ip_sockglue.c:328:22: expected struct ip_ra_chain [noderef] *next net/ipv4/ip_sockglue.c:328:22: got struct ip_ra_chain *[assigned] ra And replace one rcu_assign_ptr() by RCU_INIT_POINTER() where applicable. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 455e75bcb167..c373a9ad4555 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -303,7 +303,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; - rcu_assign_pointer(*rap, ra->next); + RCU_INIT_POINTER(*rap, ra->next); spin_unlock_bh(&ip_ra_lock); if (ra->destructor) @@ -325,7 +325,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra->sk = sk; new_ra->destructor = destructor; - new_ra->next = ra; + RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); spin_unlock_bh(&ip_ra_lock); -- cgit v1.2.3 From cc9c668a0805cdf6141ffae3d8e9a94875ec7a54 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Sep 2014 08:16:17 -0700 Subject: ipv6: udp6_gro_complete() is static net/ipv6/udp_offload.c:159:5: warning: symbol 'udp6_gro_complete' was not declared. Should it be static? Signed-off-by: Eric Dumazet Fixes: 57c67ff4bd92 ("udp: additional GRO support") Cc: Tom Herbert Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/udp_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 89cb9a9b8537..a1ad34b1c4ec 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -156,7 +156,7 @@ flush: return NULL; } -int udp6_gro_complete(struct sk_buff *skb, int nhoff) +static int udp6_gro_complete(struct sk_buff *skb, int nhoff) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); -- cgit v1.2.3 From 416c51e17b8b31b574763ac2b88ee99ddbb0c85d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Sep 2014 08:24:53 -0700 Subject: netns: remove one sparse warning net/core/net_namespace.c:227:18: warning: incorrect type in argument 1 (different address spaces) net/core/net_namespace.c:227:18: expected void const * net/core/net_namespace.c:227:18: got struct net_generic [noderef] *gen We can use rcu_access_pointer() here as read-side access to the pointer was removed at least one grace period ago. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 7c6b51a58968..7f155175bba8 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -224,7 +224,7 @@ static void net_free(struct net *net) return; } #endif - kfree(net->gen); + kfree(rcu_access_pointer(net->gen)); kmem_cache_free(net_cachep, net); } -- cgit v1.2.3 From 72bb17b37b9076e12b388feee4a52e85ef8f6620 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Sep 2014 08:29:12 -0700 Subject: ipv4: udp4_gro_complete() is static net/ipv4/udp_offload.c:339:5: warning: symbol 'udp4_gro_complete' was not declared. Should it be static? Signed-off-by: Eric Dumazet Cc: Tom Herbert Fixes: 57c67ff4bd92 ("udp: additional GRO support") Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 84e0e05c9c0e..52d5f46abf86 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -336,7 +336,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) return err; } -int udp4_gro_complete(struct sk_buff *skb, int nhoff) +static int udp4_gro_complete(struct sk_buff *skb, int nhoff) { const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); -- cgit v1.2.3 From 46cfd725c377bc5bb32b56b5151d6de4cb5a71e3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2014 01:08:46 +0200 Subject: net: use kfree_skb_list() helper in more places Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 6 +----- net/xfrm/xfrm_output.c | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b7a3e7b3378e..2e6a0dbf7fb3 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -701,11 +701,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) return 0; } - while (frag) { - skb = frag->next; - kfree_skb(frag); - frag = skb; - } + kfree_skb_list(frag); IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index c51e8f7b8653..499d6c18a8ce 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -166,11 +166,7 @@ static int xfrm_output_gso(struct sk_buff *skb) err = xfrm_output2(segs); if (unlikely(err)) { - while ((segs = nskb)) { - nskb = segs->next; - segs->next = NULL; - kfree_skb(segs); - } + kfree_skb_list(nskb); return err; } -- cgit v1.2.3 From 03d56daafe9d4e04a8a0d305789cd3eda250746b Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 9 Sep 2014 11:23:14 -0700 Subject: ipv6: Clear flush_id to make GRO work In TCP gro we check flush_id which is derived from the IP identifier. In IPv4 gro path the flush_id is set with the expectation that every matched packet increments IP identifier. In IPv6, the flush_id is never set and thus is uinitialized. What's worse is that in IPv6 over IPv4 encapsulation, the IP identifier is taken from the outer header which is currently not incremented on every packet for Linux stack, so GRO in this case never matches packets (identifier is not increasing). This patch clears flush_id for every time for a matched packet in IPv6 gro_receive. We need to do this each time to overwrite the setting that would be done in IPv4 gro_receive per the outer header in IPv6 over Ipv4 encapsulation. Signed-off-by: Tom Herbert Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_offload.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 5bcda338bcef..929bbbcd0c8e 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -261,6 +261,9 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, /* flush if Traffic Class fields are different */ NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000)); NAPI_GRO_CB(p)->flush |= flush; + + /* Clear flush_id, there's really no concept of ID in IPv6. */ + NAPI_GRO_CB(p)->flush_id = 0; } NAPI_GRO_CB(skb)->flush |= flush; -- cgit v1.2.3 From 9667e9bb3f366435dde74f22578876daae850feb Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 9 Sep 2014 11:23:15 -0700 Subject: ipip: Add gro callbacks to ipip offload Add inet_gro_receive and inet_gro_complete to ipip_offload to support GRO. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b537bd94906c..72011cc4c13b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1666,6 +1666,8 @@ static const struct net_offload ipip_offload = { .callbacks = { .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, + .gro_receive = inet_gro_receive, + .gro_complete = inet_gro_complete, }, }; -- cgit v1.2.3 From 19424e052fb44da2f00d1a868cbb51f3e9f4bbb5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 9 Sep 2014 11:23:16 -0700 Subject: sit: Add gro callbacks to sit_offload Add ipv6_gro_receive and ipv6_gro_complete to sit_offload to support GRO. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/ip6_offload.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 929bbbcd0c8e..9952f3fce30a 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -317,6 +317,8 @@ static const struct net_offload sit_offload = { .callbacks = { .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, + .gro_receive = ipv6_gro_receive, + .gro_complete = ipv6_gro_complete, }, }; -- cgit v1.2.3 From ef423a410943dab9198ec1d7d9558cb53a9569cc Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 9 Sep 2014 21:17:28 -0700 Subject: atm: Convert pr_warning to pr_warn Use the more common pr_warn. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/atm/clip.c | 4 ++-- net/atm/common.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/atm/clip.c b/net/atm/clip.c index 46339040fef0..1d9eaa4f041a 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -384,7 +384,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev); old = xchg(&entry->vccs->xoff, 1); /* assume XOFF ... */ if (old) { - pr_warning("XOFF->XOFF transition\n"); + pr_warn("XOFF->XOFF transition\n"); goto out_release_neigh; } dev->stats.tx_packets++; @@ -447,7 +447,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip) struct rtable *rt; if (vcc->push != clip_push) { - pr_warning("non-CLIP VCC\n"); + pr_warn("non-CLIP VCC\n"); return -EBADF; } clip_vcc = CLIP_VCC(vcc); diff --git a/net/atm/common.c b/net/atm/common.c index 7b491006eaf4..6a765156a3f6 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -300,7 +300,7 @@ static int adjust_tp(struct atm_trafprm *tp, unsigned char aal) max_sdu = ATM_MAX_AAL34_PDU; break; default: - pr_warning("AAL problems ... (%d)\n", aal); + pr_warn("AAL problems ... (%d)\n", aal); /* fall through */ case ATM_AAL5: max_sdu = ATM_MAX_AAL5_PDU; -- cgit v1.2.3 From 294a0b7f3148e2a4e916965a6d14838e08143ba8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 9 Sep 2014 21:17:30 -0700 Subject: pktgen: Convert pr_warning to pr_warn Use the more common pr_warn. Realign arguments. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/core/pktgen.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 21cb4839bc97..5c728aaf8d6c 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -506,7 +506,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, pktgen_reset_all_threads(pn); else - pr_warning("Unknown command: %s\n", data); + pr_warn("Unknown command: %s\n", data); return count; } @@ -861,14 +861,14 @@ static ssize_t pktgen_if_write(struct file *file, pg_result = &(pkt_dev->result[0]); if (count < 1) { - pr_warning("wrong command format\n"); + pr_warn("wrong command format\n"); return -EINVAL; } max = count; tmp = count_trail_chars(user_buffer, max); if (tmp < 0) { - pr_warning("illegal format\n"); + pr_warn("illegal format\n"); return tmp; } i = tmp; @@ -2056,15 +2056,15 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) ntxq = pkt_dev->odev->real_num_tx_queues; if (ntxq <= pkt_dev->queue_map_min) { - pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", - pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, - pkt_dev->odevname); + pr_warn("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); pkt_dev->queue_map_min = (ntxq ?: 1) - 1; } if (pkt_dev->queue_map_max >= ntxq) { - pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", - pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, - pkt_dev->odevname); + pr_warn("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); pkt_dev->queue_map_max = (ntxq ?: 1) - 1; } @@ -3173,8 +3173,8 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev) int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1; if (!pkt_dev->running) { - pr_warning("interface: %s is already stopped\n", - pkt_dev->odevname); + pr_warn("interface: %s is already stopped\n", + pkt_dev->odevname); return -EINVAL; } @@ -3692,7 +3692,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, pr_debug("remove_device pkt_dev=%p\n", pkt_dev); if (pkt_dev->running) { - pr_warning("WARNING: trying to remove a running interface, stopping it now\n"); + pr_warn("WARNING: trying to remove a running interface, stopping it now\n"); pktgen_stop_device(pkt_dev); } -- cgit v1.2.3 From 47c4cfc37fb71e0fa801a4ed9228de83404abfce Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 9 Sep 2014 21:17:31 -0700 Subject: iucv: Convert pr_warning to pr_warn Use the more common pr_warn. Coalesce formats. Realign arguments. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/iucv/iucv.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index da787930df0a..2a6a1fdd62c0 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -493,8 +493,8 @@ static void iucv_declare_cpu(void *data) err = "Paging or storage error"; break; } - pr_warning("Defining an interrupt buffer on CPU %i" - " failed with 0x%02x (%s)\n", cpu, rc, err); + pr_warn("Defining an interrupt buffer on CPU %i failed with 0x%02x (%s)\n", + cpu, rc, err); return; } @@ -1831,7 +1831,7 @@ static void iucv_external_interrupt(struct ext_code ext_code, BUG_ON(p->iptype < 0x01 || p->iptype > 0x09); work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC); if (!work) { - pr_warning("iucv_external_interrupt: out of memory\n"); + pr_warn("iucv_external_interrupt: out of memory\n"); return; } memcpy(&work->data, p, sizeof(work->data)); @@ -1974,8 +1974,7 @@ static int iucv_pm_restore(struct device *dev) printk(KERN_WARNING "iucv_pm_restore %p\n", iucv_path_table); #endif if ((iucv_pm_state != IUCV_PM_RESTORING) && iucv_path_table) - pr_warning("Suspending Linux did not completely close all IUCV " - "connections\n"); + pr_warn("Suspending Linux did not completely close all IUCV connections\n"); iucv_pm_state = IUCV_PM_RESTORING; if (cpumask_empty(&iucv_irq_cpumask)) { rc = iucv_query_maxconn(); -- cgit v1.2.3 From b167a37c7bbc6f7589f439ba7d9a49af5ad37ff5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 9 Sep 2014 21:17:32 -0700 Subject: netfilter: Convert pr_warning to pr_warn Use the more common pr_warn. Other miscellanea: o Coalesce formats o Realign arguments Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/netfilter/ipset/ip_set_core.c | 23 +++++++++++----------- net/netfilter/ipset/ip_set_hash_gen.h | 12 ++++++------ net/netfilter/xt_connbytes.c | 2 +- net/netfilter/xt_hashlimit.c | 2 +- net/netfilter/xt_set.c | 36 ++++++++++++++++------------------- 5 files changed, 35 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index ec8114fae50b..5593e97426c4 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -101,7 +101,7 @@ load_settype(const char *name) nfnl_unlock(NFNL_SUBSYS_IPSET); pr_debug("try to load ip_set_%s\n", name); if (request_module("ip_set_%s", name) < 0) { - pr_warning("Can't find ip_set type %s\n", name); + pr_warn("Can't find ip_set type %s\n", name); nfnl_lock(NFNL_SUBSYS_IPSET); return false; } @@ -195,20 +195,19 @@ ip_set_type_register(struct ip_set_type *type) int ret = 0; if (type->protocol != IPSET_PROTOCOL) { - pr_warning("ip_set type %s, family %s, revision %u:%u uses " - "wrong protocol version %u (want %u)\n", - type->name, family_name(type->family), - type->revision_min, type->revision_max, - type->protocol, IPSET_PROTOCOL); + pr_warn("ip_set type %s, family %s, revision %u:%u uses wrong protocol version %u (want %u)\n", + type->name, family_name(type->family), + type->revision_min, type->revision_max, + type->protocol, IPSET_PROTOCOL); return -EINVAL; } ip_set_type_lock(); if (find_set_type(type->name, type->family, type->revision_min)) { /* Duplicate! */ - pr_warning("ip_set type %s, family %s with revision min %u " - "already registered!\n", type->name, - family_name(type->family), type->revision_min); + pr_warn("ip_set type %s, family %s with revision min %u already registered!\n", + type->name, family_name(type->family), + type->revision_min); ret = -EINVAL; goto unlock; } @@ -228,9 +227,9 @@ ip_set_type_unregister(struct ip_set_type *type) { ip_set_type_lock(); if (!find_set_type(type->name, type->family, type->revision_min)) { - pr_warning("ip_set type %s, family %s with revision min %u " - "not registered\n", type->name, - family_name(type->family), type->revision_min); + pr_warn("ip_set type %s, family %s with revision min %u not registered\n", + type->name, family_name(type->family), + type->revision_min); goto unlock; } list_del_rcu(&type->list); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 61c7fb052802..80f49dfba555 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -565,8 +565,8 @@ retry: set->name, orig->htable_bits, htable_bits, orig); if (!htable_bits) { /* In case we have plenty of memory :-) */ - pr_warning("Cannot increase the hashsize of set %s further\n", - set->name); + pr_warn("Cannot increase the hashsize of set %s further\n", + set->name); return -IPSET_ERR_HASH_FULL; } t = ip_set_alloc(sizeof(*t) @@ -651,8 +651,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (h->elements >= h->maxelem) { if (net_ratelimit()) - pr_warning("Set %s is full, maxelem %u reached\n", - set->name, h->maxelem); + pr_warn("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); return -IPSET_ERR_HASH_FULL; } @@ -998,8 +998,8 @@ mtype_list(const struct ip_set *set, nla_put_failure: nlmsg_trim(skb, incomplete); if (unlikely(first == cb->args[IPSET_CB_ARG0])) { - pr_warning("Can't list set %s: one bucket does not fit into " - "a message. Please report it!\n", set->name); + pr_warn("Can't list set %s: one bucket does not fit into a message. Please report it!\n", + set->name); cb->args[IPSET_CB_ARG0] = 0; return -EMSGSIZE; } diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 1e634615ab9d..d4bec261e74e 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -120,7 +120,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par) * accounting is enabled, so complain in the hope that someone notices. */ if (!nf_ct_acct_enabled(par->net)) { - pr_warning("Forcing CT accounting to be enabled\n"); + pr_warn("Forcing CT accounting to be enabled\n"); nf_ct_set_acct(par->net, true); } diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 52eb3e03458d..05fbc2a0be46 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -943,7 +943,7 @@ static int __init hashlimit_mt_init(void) sizeof(struct dsthash_ent), 0, 0, NULL); if (!hashlimit_cachep) { - pr_warning("unable to create slab cache\n"); + pr_warn("unable to create slab cache\n"); goto err2; } return 0; diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 80c2e2d603e0..cb70f6ec5695 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -84,13 +84,12 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par) index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find set identified by id %u to match\n", - info->match_set.index); + pr_warn("Cannot find set identified by id %u to match\n", + info->match_set.index); return -ENOENT; } if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) { - pr_warning("Protocol error: set match dimension " - "is over the limit!\n"); + pr_warn("Protocol error: set match dimension is over the limit!\n"); ip_set_nfnl_put(par->net, info->match_set.index); return -ERANGE; } @@ -134,13 +133,12 @@ set_match_v1_checkentry(const struct xt_mtchk_param *par) index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find set identified by id %u to match\n", - info->match_set.index); + pr_warn("Cannot find set identified by id %u to match\n", + info->match_set.index); return -ENOENT; } if (info->match_set.dim > IPSET_DIM_MAX) { - pr_warning("Protocol error: set match dimension " - "is over the limit!\n"); + pr_warn("Protocol error: set match dimension is over the limit!\n"); ip_set_nfnl_put(par->net, info->match_set.index); return -ERANGE; } @@ -230,8 +228,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) if (info->add_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find add_set index %u as target\n", - info->add_set.index); + pr_warn("Cannot find add_set index %u as target\n", + info->add_set.index); return -ENOENT; } } @@ -239,8 +237,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) if (info->del_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find del_set index %u as target\n", - info->del_set.index); + pr_warn("Cannot find del_set index %u as target\n", + info->del_set.index); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); return -ENOENT; @@ -248,8 +246,7 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) } if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 || info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) { - pr_warning("Protocol error: SET target dimension " - "is over the limit!\n"); + pr_warn("Protocol error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) @@ -303,8 +300,8 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) if (info->add_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find add_set index %u as target\n", - info->add_set.index); + pr_warn("Cannot find add_set index %u as target\n", + info->add_set.index); return -ENOENT; } } @@ -312,8 +309,8 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) if (info->del_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find del_set index %u as target\n", - info->del_set.index); + pr_warn("Cannot find del_set index %u as target\n", + info->del_set.index); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); return -ENOENT; @@ -321,8 +318,7 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) } if (info->add_set.dim > IPSET_DIM_MAX || info->del_set.dim > IPSET_DIM_MAX) { - pr_warning("Protocol error: SET target dimension " - "is over the limit!\n"); + pr_warn("Protocol error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) -- cgit v1.2.3 From 0fc4dffad13e81deb3bf72e74cac292172df5285 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Wed, 10 Sep 2014 14:02:50 +0200 Subject: tipc: fix sparse warnings This fixes the following sparse warnings: sparse: symbol 'tipc_update_nametbl' was not declared. Should it be static? Also, the function is changed to return bool upon success, rather than a potentially freed pointer. Signed-off-by: Erik Hugne Reported-by: Dan Carpenter Signed-off-by: David S. Miller --- net/tipc/name_distr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 780ef710a849..376d2bb51d8d 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -284,8 +284,7 @@ static void named_purge_publ(struct publication *publ) * tipc_nametbl_lock must be held. * Returns the publication item if successful, otherwise NULL. */ -struct publication *tipc_update_nametbl(struct distr_item *i, u32 node, - u32 dtype) +static bool tipc_update_nametbl(struct distr_item *i, u32 node, u32 dtype) { struct publication *publ = NULL; @@ -298,6 +297,7 @@ struct publication *tipc_update_nametbl(struct distr_item *i, u32 node, tipc_nodesub_subscribe(&publ->subscr, node, publ, (net_ev_handler) named_purge_publ); + return true; } } else if (dtype == WITHDRAWAL) { publ = tipc_nametbl_remove_publ(ntohl(i->type), ntohl(i->lower), @@ -306,11 +306,12 @@ struct publication *tipc_update_nametbl(struct distr_item *i, u32 node, if (publ) { tipc_nodesub_unsubscribe(&publ->subscr); kfree(publ); + return true; } } else { pr_warn("Unrecognized name table message received\n"); } - return publ; + return false; } /** -- cgit v1.2.3 From 3a7dbfb8ff943711be4221df978254ad2bc1ac46 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 10 Sep 2014 17:37:41 -0700 Subject: Bluetooth: Remove unnecessary early initialization of variable We do nothing else with the auth variable in smp_cmd_pairing_rsp() besides passing it to tk_request() which in turn only cares about whether one of the sides had the MITM bit set. It is therefore unnecessary to assign a value to it until just before calling tk_request(), and this value can simply be the bit-wise or of the local and remote requirements. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index dc575aba2e65..dbd17a07dc2e 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1003,7 +1003,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_pairing *req, *rsp = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; - u8 key_size, auth = SMP_AUTH_NONE; + u8 key_size, auth; int ret; BT_DBG("conn %p", conn); @@ -1044,11 +1044,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) */ smp->remote_key_dist &= rsp->resp_key_dist; - if ((req->auth_req & SMP_AUTH_BONDING) && - (rsp->auth_req & SMP_AUTH_BONDING)) - auth = SMP_AUTH_BONDING; - - auth |= (req->auth_req | rsp->auth_req) & SMP_AUTH_MITM; + auth = (req->auth_req | rsp->auth_req); ret = tk_request(conn, 0, auth, req->io_capability, rsp->io_capability); if (ret) -- cgit v1.2.3 From c05b9339c8a448a2df0c8598424ea9c0933288d1 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 10 Sep 2014 17:37:42 -0700 Subject: Bluetooth: Fix ignoring unknown SMP authentication requirement bits The SMP specification states that we should ignore any unknown bits from the authentication requirement. We already have a define for masking out unknown bits but we haven't used it in all places so far. This patch adds usage of the AUTH_REQ_MASK to all places that need it and ensures that we don't pass unknown bits onward to other functions. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index dbd17a07dc2e..ef8f96d2c059 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -949,8 +949,11 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (!smp) return SMP_UNSPECIFIED; + /* We didn't start the pairing, so match remote */ + auth = req->auth_req & AUTH_REQ_MASK; + if (!test_bit(HCI_BONDABLE, &hdev->dev_flags) && - (req->auth_req & SMP_AUTH_BONDING)) + (auth & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; SMP_DISALLOW_CMD(smp, SMP_CMD_PAIRING_REQ); @@ -959,9 +962,6 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(&smp->preq[1], req, sizeof(*req)); skb_pull(skb, sizeof(*req)); - /* We didn't start the pairing, so match remote */ - auth = req->auth_req; - sec_level = authreq_to_seclevel(auth); if (sec_level > conn->hcon->pending_sec_level) conn->hcon->pending_sec_level = sec_level; @@ -1024,6 +1024,8 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; + auth = rsp->auth_req & AUTH_REQ_MASK; + /* If we need MITM check that it can be acheived */ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { u8 method; @@ -1044,7 +1046,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) */ smp->remote_key_dist &= rsp->resp_key_dist; - auth = (req->auth_req | rsp->auth_req); + auth |= req->auth_req; ret = tk_request(conn, 0, auth, req->io_capability, rsp->io_capability); if (ret) @@ -1160,7 +1162,7 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_pairing cp; struct hci_conn *hcon = conn->hcon; struct smp_chan *smp; - u8 sec_level; + u8 sec_level, auth; BT_DBG("conn %p", conn); @@ -1170,7 +1172,9 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) if (hcon->role != HCI_ROLE_MASTER) return SMP_CMD_NOTSUPP; - sec_level = authreq_to_seclevel(rp->auth_req); + auth = rp->auth_req & AUTH_REQ_MASK; + + sec_level = authreq_to_seclevel(auth); if (smp_sufficient_security(hcon, sec_level)) return 0; @@ -1185,13 +1189,13 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) return SMP_UNSPECIFIED; if (!test_bit(HCI_BONDABLE, &hcon->hdev->dev_flags) && - (rp->auth_req & SMP_AUTH_BONDING)) + (auth & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; skb_pull(skb, sizeof(*rp)); memset(&cp, 0, sizeof(cp)); - build_pairing_cmd(conn, &cp, NULL, rp->auth_req); + build_pairing_cmd(conn, &cp, NULL, auth); smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], &cp, sizeof(cp)); -- cgit v1.2.3 From 24bd0bd94e0947e257c5cd6a85b0e337d953e79c Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 10 Sep 2014 17:37:43 -0700 Subject: Bluetooth: Centralize disallowing SMP commands to a single place All the cases where we mark SMP commands as dissalowed are their respective command handlers. We can therefore simplify the code by always clearing the bit immediately after testing it. This patch converts the corresponding test_bit() call to a test_and_clear_bit() call and also removes the now unused SMP_DISALLOW_CMD macro. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ef8f96d2c059..be8371b4eb63 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -32,7 +32,6 @@ #include "smp.h" #define SMP_ALLOW_CMD(smp, code) set_bit(code, &smp->allow_cmd) -#define SMP_DISALLOW_CMD(smp, code) clear_bit(code, &smp->allow_cmd) #define SMP_TIMEOUT msecs_to_jiffies(30000) @@ -956,8 +955,6 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) (auth & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; - SMP_DISALLOW_CMD(smp, SMP_CMD_PAIRING_REQ); - smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], req, sizeof(*req)); skb_pull(skb, sizeof(*req)); @@ -1014,8 +1011,6 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) if (conn->hcon->role != HCI_ROLE_MASTER) return SMP_CMD_NOTSUPP; - SMP_DISALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); - skb_pull(skb, sizeof(*rsp)); req = (void *) &smp->preq[1]; @@ -1071,8 +1066,6 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(smp->pcnf)) return SMP_INVALID_PARAMS; - SMP_DISALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); - memcpy(smp->pcnf, skb->data, sizeof(smp->pcnf)); skb_pull(skb, sizeof(smp->pcnf)); @@ -1101,8 +1094,6 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(smp->rrnd)) return SMP_INVALID_PARAMS; - SMP_DISALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); - memcpy(smp->rrnd, skb->data, sizeof(smp->rrnd)); skb_pull(skb, sizeof(smp->rrnd)); @@ -1293,7 +1284,6 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; - SMP_DISALLOW_CMD(smp, SMP_CMD_ENCRYPT_INFO); SMP_ALLOW_CMD(smp, SMP_CMD_MASTER_IDENT); skb_pull(skb, sizeof(*rp)); @@ -1321,7 +1311,6 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ENC_KEY; - SMP_DISALLOW_CMD(smp, SMP_CMD_MASTER_IDENT); if (smp->remote_key_dist & SMP_DIST_ID_KEY) SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); else if (smp->remote_key_dist & SMP_DIST_SIGN) @@ -1353,7 +1342,6 @@ static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; - SMP_DISALLOW_CMD(smp, SMP_CMD_IDENT_INFO); SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO); skb_pull(skb, sizeof(*info)); @@ -1380,7 +1368,6 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ID_KEY; - SMP_DISALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO); if (smp->remote_key_dist & SMP_DIST_SIGN) SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); @@ -1436,8 +1423,6 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_SIGN; - SMP_DISALLOW_CMD(smp, SMP_CMD_SIGN_INFO); - skb_pull(skb, sizeof(*rp)); hci_dev_lock(hdev); @@ -1482,7 +1467,7 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) if (code > SMP_CMD_MAX) goto drop; - if (smp && !test_bit(code, &smp->allow_cmd)) + if (smp && !test_and_clear_bit(code, &smp->allow_cmd)) goto drop; /* If we don't have a context the only allowed commands are -- cgit v1.2.3 From 1afc2a1ab6612dcc3f26db7ca1afba9cff359f1c Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 10 Sep 2014 17:37:44 -0700 Subject: Bluetooth: Fix SMP security level when we have no IO capabilities When the local IO capability is NoInputNoOutput any attempt to convert the remote authentication requirement to a target security level is futile. This patch makes sure that we set the target security level at most to MEDIUM if the local IO capability is NoInputNoOutput. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index be8371b4eb63..a08b077cb725 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -959,7 +959,11 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(&smp->preq[1], req, sizeof(*req)); skb_pull(skb, sizeof(*req)); - sec_level = authreq_to_seclevel(auth); + if (conn->hcon->io_capability == 0x03) + sec_level = BT_SECURITY_MEDIUM; + else + sec_level = authreq_to_seclevel(auth); + if (sec_level > conn->hcon->pending_sec_level) conn->hcon->pending_sec_level = sec_level; @@ -1165,7 +1169,11 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) auth = rp->auth_req & AUTH_REQ_MASK; - sec_level = authreq_to_seclevel(auth); + if (hcon->io_capability == 0x03) + sec_level = BT_SECURITY_MEDIUM; + else + sec_level = authreq_to_seclevel(auth); + if (smp_sufficient_security(hcon, sec_level)) return 0; -- cgit v1.2.3 From a6f7833ca353d50de46e3532afebe4abfc5dc4d9 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 10 Sep 2014 17:37:45 -0700 Subject: Bluetooth: Add smp_ltk_sec_level() helper function There are several places that need to determine the security level that an LTK can provide. This patch adds a convenience function for this to help make the code more readable. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 5 +---- net/bluetooth/smp.c | 2 +- net/bluetooth/smp.h | 8 ++++++++ 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 3a8381ab992f..603a17cc52ac 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4506,10 +4506,7 @@ static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb) memcpy(cp.ltk, ltk->val, sizeof(ltk->val)); cp.handle = cpu_to_le16(conn->handle); - if (ltk->authenticated) - conn->pending_sec_level = BT_SECURITY_HIGH; - else - conn->pending_sec_level = BT_SECURITY_MEDIUM; + conn->pending_sec_level = smp_ltk_sec_level(ltk); conn->enc_key_size = ltk->enc_size; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index a08b077cb725..3700dd8d9d0b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1114,7 +1114,7 @@ static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) if (!key) return false; - if (sec_level > BT_SECURITY_MEDIUM && !key->authenticated) + if (smp_ltk_sec_level(key) < sec_level) return false; if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 5240537efde3..86a683a8b491 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -125,6 +125,14 @@ enum { SMP_LTK_SLAVE, }; +static inline u8 smp_ltk_sec_level(struct smp_ltk *key) +{ + if (key->authenticated) + return BT_SECURITY_HIGH; + + return BT_SECURITY_MEDIUM; +} + /* SMP Commands */ bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); -- cgit v1.2.3 From aeaeb4bbca520b862d3b3e7cd74c23042815a160 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 10 Sep 2014 17:37:46 -0700 Subject: Bluetooth: Fix L2CAP information request handling for fixed channels Even if we have no connection-oriented channels we should perform the L2CAP Information Request procedures before notifying L2CAP channels of the connection. This is so that the L2CAP channel implementations can perform checks on what the remote side supports (e.g. does it support the fixed channel in question). So far the code has relied on the l2cap_do_start() function to initiate the Information Request, however l2cap_do_start() is used on a per-channel basis and only for connection-oriented channels. This means that if there are no connection-oriented channels on the system we would never start the Information Request procedure. This patch creates a new l2cap_request_info() helper function to initiate the Information Request procedure, and ensures that it is called whenever a BR/EDR connection has been established. The patch also updates fixed channels to be notified of connection readiness only once the Information Request procedure has completed. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 53 +++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index b71430caab4a..8d53fc57faba 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1283,6 +1283,24 @@ static void l2cap_start_connection(struct l2cap_chan *chan) } } +static void l2cap_request_info(struct l2cap_conn *conn) +{ + struct l2cap_info_req req; + + if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) + return; + + req.type = cpu_to_le16(L2CAP_IT_FEAT_MASK); + + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT; + conn->info_ident = l2cap_get_ident(conn); + + schedule_delayed_work(&conn->info_timer, L2CAP_INFO_TIMEOUT); + + l2cap_send_cmd(conn, conn->info_ident, L2CAP_INFO_REQ, + sizeof(req), &req); +} + static void l2cap_do_start(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; @@ -1292,26 +1310,17 @@ static void l2cap_do_start(struct l2cap_chan *chan) return; } - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) { - if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)) - return; - - if (l2cap_chan_check_security(chan, true) && - __l2cap_no_conn_pending(chan)) { - l2cap_start_connection(chan); - } - } else { - struct l2cap_info_req req; - req.type = cpu_to_le16(L2CAP_IT_FEAT_MASK); - - conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT; - conn->info_ident = l2cap_get_ident(conn); + if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)) { + l2cap_request_info(conn); + return; + } - schedule_delayed_work(&conn->info_timer, L2CAP_INFO_TIMEOUT); + if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)) + return; - l2cap_send_cmd(conn, conn->info_ident, L2CAP_INFO_REQ, - sizeof(req), &req); - } + if (l2cap_chan_check_security(chan, true) && + __l2cap_no_conn_pending(chan)) + l2cap_start_connection(chan); } static inline int l2cap_mode_supported(__u8 mode, __u32 feat_mask) @@ -1370,6 +1379,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn) l2cap_chan_lock(chan); if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + l2cap_chan_ready(chan); l2cap_chan_unlock(chan); continue; } @@ -1474,6 +1484,9 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) BT_DBG("conn %p", conn); + if (hcon->type == ACL_LINK) + l2cap_request_info(conn); + mutex_lock(&conn->chan_lock); list_for_each_entry(chan, &conn->chan_l, list) { @@ -1488,8 +1501,8 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) if (hcon->type == LE_LINK) { l2cap_le_start(chan); } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { - l2cap_chan_ready(chan); - + if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) + l2cap_chan_ready(chan); } else if (chan->state == BT_CONNECT) { l2cap_do_start(chan); } -- cgit v1.2.3 From 5be5e275ad214bbb420425754354add679d8ab68 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 10 Sep 2014 17:58:54 -0700 Subject: Bluetooth: Avoid hard-coded IO capability values in SMP This is a trivial change to use a proper define for the NoInputNoOutput IO capability instead of hard-coded values. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 3700dd8d9d0b..51fc7db2d84e 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -959,7 +959,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(&smp->preq[1], req, sizeof(*req)); skb_pull(skb, sizeof(*req)); - if (conn->hcon->io_capability == 0x03) + if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) sec_level = BT_SECURITY_MEDIUM; else sec_level = authreq_to_seclevel(auth); @@ -1169,7 +1169,7 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) auth = rp->auth_req & AUTH_REQ_MASK; - if (hcon->io_capability == 0x03) + if (hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) sec_level = BT_SECURITY_MEDIUM; else sec_level = authreq_to_seclevel(auth); -- cgit v1.2.3 From 7ed3fa20780a5efd22bb192be0908468e7c376ed Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 10 Sep 2014 22:16:35 -0700 Subject: Bluetooth: Expire RPA if encryption fails If encryption fails and we're using an RPA it may be because of a conflict with another device. To avoid repeated failures the safest action is to simply mark the RPA as expired so that a new one gets generated as soon as the connection drops. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 603a17cc52ac..8b0a2a6de419 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2438,6 +2438,12 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) } } + /* We should disregard the current RPA and generate a new one + * whenever the encryption procedure fails. + */ + if (ev->status && conn->type == LE_LINK) + set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); + clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); if (ev->status && conn->state == BT_CONNECTED) { -- cgit v1.2.3 From b47f610bd6e88f9d1032132d81b23c928a645e9d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Sep 2014 13:39:54 +0300 Subject: cfg80211: clear connect keys when freeing them When freeing the connect keys, clear the memory to avoid having the key material stick around in memory "forever". Signed-off-by: Johannes Berg --- net/wireless/ibss.c | 4 ++-- net/wireless/nl80211.c | 8 ++++---- net/wireless/sme.c | 6 +++--- net/wireless/util.c | 2 +- net/wireless/wext-sme.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 8f345da3ea5f..e24fc585c883 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -115,7 +115,7 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, } if (WARN_ON(wdev->connect_keys)) - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = connkeys; wdev->ibss_fixed = params->channel_fixed; @@ -161,7 +161,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) ASSERT_WDEV_LOCK(wdev); - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = NULL; rdev_set_qos_map(rdev, dev, NULL); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cf178d2b621d..e388a9f28895 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6866,7 +6866,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys); if (err) - kfree(connkeys); + kzfree(connkeys); return err; } @@ -7235,7 +7235,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) { if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) { - kfree(connkeys); + kzfree(connkeys); return -EINVAL; } memcpy(&connect.ht_capa, @@ -7253,7 +7253,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) { if (!info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) { - kfree(connkeys); + kzfree(connkeys); return -EINVAL; } memcpy(&connect.vht_capa, @@ -7273,7 +7273,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL); wdev_unlock(dev->ieee80211_ptr); if (err) - kfree(connkeys); + kzfree(connkeys); return err; } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 8bbeeb302216..dc1668ff543b 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -641,7 +641,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, } if (status != WLAN_STATUS_SUCCESS) { - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = NULL; wdev->ssid_len = 0; if (bss) { @@ -918,7 +918,7 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); if (WARN_ON(wdev->connect_keys)) { - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = NULL; } @@ -978,7 +978,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = NULL; if (wdev->conn) diff --git a/net/wireless/util.c b/net/wireless/util.c index a8b2816ffcb9..5e233a577d0f 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -797,7 +797,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) netdev_err(dev, "failed to set mgtdef %d\n", i); } - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = NULL; } diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index c7e5c8eb4f24..368611c05739 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -57,7 +57,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, err = cfg80211_connect(rdev, wdev->netdev, &wdev->wext.connect, ck, prev_bssid); if (err) - kfree(ck); + kzfree(ck); return err; } -- cgit v1.2.3 From 29c3f9c3996abea060fa6e0b9e6a30d3f0cc828c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Sep 2014 13:39:55 +0300 Subject: mac80211: clear key material when freeing keys When freeing the key, clear the memory to avoid having the key material stick around in memory "forever". Signed-off-by: Johannes Berg --- net/mac80211/key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index f320a04380b9..4712150dc210 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -422,7 +422,7 @@ static void ieee80211_key_free_common(struct ieee80211_key *key) ieee80211_aes_key_free(key->u.ccmp.tfm); if (key->conf.cipher == WLAN_CIPHER_SUITE_AES_CMAC) ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); - kfree(key); + kzfree(key); } static void __ieee80211_key_destroy(struct ieee80211_key *key, -- cgit v1.2.3 From 538c9eb8b3fd33d3a0722b2c04ec4f574eaa6e9f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Sep 2014 13:39:56 +0300 Subject: cfg80211: clear wext keys when freeing and removing them When freeing the keys stored for wireless extensions, clear the memory to avoid having the key material stick around in memory "forever". Similarly, when userspace overwrites a key, actually clear it instead of just setting the key length to zero. Signed-off-by: Johannes Berg --- net/wireless/core.c | 2 +- net/wireless/wext-compat.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 9698fe709251..55ec9be9feb7 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1007,7 +1007,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); #ifdef CONFIG_CFG80211_WEXT - kfree(wdev->wext.keys); + kzfree(wdev->wext.keys); #endif } /* diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 11120bb14162..0f47948c572f 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -496,6 +496,8 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, err = 0; if (!err) { if (!addr) { + memset(wdev->wext.keys->data[idx], 0, + sizeof(wdev->wext.keys->data[idx])); wdev->wext.keys->params[idx].key_len = 0; wdev->wext.keys->params[idx].cipher = 0; } -- cgit v1.2.3 From 78f686cae0c67a2edd167cbbe2f36017f0fa4b30 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Sep 2014 22:28:06 +0300 Subject: cfg80211: don't put kek/kck/replay counter on the stack There's no need to put the values on the stack, just pass a pointer to the data in the nl80211 message. This reduces stack usage and avoids potential issues with putting sensitive data on the stack. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 ++++------ net/wireless/nl80211.c | 10 +++------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0d17ec9df692..c2c710c14a50 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1980,14 +1980,12 @@ struct cfg80211_wowlan_wakeup { /** * struct cfg80211_gtk_rekey_data - rekey data - * @kek: key encryption key - * @kck: key confirmation key - * @replay_ctr: replay counter + * @kek: key encryption key (NL80211_KEK_LEN bytes) + * @kck: key confirmation key (NL80211_KCK_LEN bytes) + * @replay_ctr: replay counter (NL80211_REPLAY_CTR_LEN bytes) */ struct cfg80211_gtk_rekey_data { - u8 kek[NL80211_KEK_LEN]; - u8 kck[NL80211_KCK_LEN]; - u8 replay_ctr[NL80211_REPLAY_CTR_LEN]; + const u8 *kek, *kck, *replay_ctr; }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e388a9f28895..bebdf3d0ae75 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8959,13 +8959,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN) return -ERANGE; - memcpy(rekey_data.kek, nla_data(tb[NL80211_REKEY_DATA_KEK]), - NL80211_KEK_LEN); - memcpy(rekey_data.kck, nla_data(tb[NL80211_REKEY_DATA_KCK]), - NL80211_KCK_LEN); - memcpy(rekey_data.replay_ctr, - nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]), - NL80211_REPLAY_CTR_LEN); + rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]); + rekey_data.kck = nla_data(tb[NL80211_REKEY_DATA_KCK]); + rekey_data.replay_ctr = nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]); wdev_lock(wdev); if (!wdev->current_bss) { -- cgit v1.2.3 From 5393b917bcbb0ce0338668c89397137bd2b7436e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Sep 2014 15:00:16 +0300 Subject: cfg80211: clear nl80211 messages carrying keys after processing Clear any nl80211 messages that might contain keys after processing them to avoid leaving their data in memory "forever" after they've been freed. Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bebdf3d0ae75..e9fbd4f4ddb0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9400,6 +9400,7 @@ static int nl80211_set_qos_map(struct sk_buff *skb, /* If a netdev is associated, it must be UP, P2P must be started */ #define NL80211_FLAG_NEED_WDEV_UP (NL80211_FLAG_NEED_WDEV |\ NL80211_FLAG_CHECK_NETDEV_UP) +#define NL80211_FLAG_CLEAR_SKB 0x20 static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) @@ -9483,8 +9484,20 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, dev_put(info->user_ptr[1]); } } + if (ops->internal_flags & NL80211_FLAG_NEED_RTNL) rtnl_unlock(); + + /* If needed, clear the netlink message payload from the SKB + * as it might contain key data that shouldn't stick around on + * the heap after the SKB is freed. The netlink message header + * is still needed for further processing, so leave it intact. + */ + if (ops->internal_flags & NL80211_FLAG_CLEAR_SKB) { + struct nlmsghdr *nlh = nlmsg_hdr(skb); + + memset(nlmsg_data(nlh), 0, nlmsg_len(nlh)); + } } static const struct genl_ops nl80211_ops[] = { @@ -9552,7 +9565,8 @@ static const struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_NEW_KEY, @@ -9560,7 +9574,8 @@ static const struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DEL_KEY, @@ -9738,7 +9753,8 @@ static const struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_ASSOCIATE, @@ -9972,7 +9988,8 @@ static const struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_TDLS_MGMT, -- cgit v1.2.3 From ca12c0c83334a84581bb01daaedf1009deb09204 Mon Sep 17 00:00:00 2001 From: Thomas Huehn Date: Tue, 9 Sep 2014 23:22:13 +0200 Subject: mac80211: Unify rate statistic variables between Minstrel & Minstrel_HT Minstrel and Mintrel_HT used there own structs to keep track of rate statistics. Unify those variables in struct minstrel_rate_states and move it to rc80211_minstrel.h for common usage. This is a clean-up patch to prepare Minstrel and Minstrel_HT codebase for upcoming TPC. Signed-off-by: Thomas Huehn Acked-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel.c | 98 +++++++++++++++++---------------- net/mac80211/rc80211_minstrel.h | 43 ++++++++------- net/mac80211/rc80211_minstrel_debugfs.c | 19 ++++--- net/mac80211/rc80211_minstrel_ht.h | 22 -------- 4 files changed, 85 insertions(+), 97 deletions(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 1c1469c36dca..2baa7ed8789d 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -75,7 +75,7 @@ minstrel_sort_best_tp_rates(struct minstrel_sta_info *mi, int i, u8 *tp_list) { int j = MAX_THR_RATES; - while (j > 0 && mi->r[i].cur_tp > mi->r[tp_list[j - 1]].cur_tp) + while (j > 0 && mi->r[i].stats.cur_tp > mi->r[tp_list[j - 1]].stats.cur_tp) j--; if (j < MAX_THR_RATES - 1) memmove(&tp_list[j + 1], &tp_list[j], MAX_THR_RATES - (j + 1)); @@ -92,7 +92,7 @@ minstrel_set_rate(struct minstrel_sta_info *mi, struct ieee80211_sta_rates *rate ratetbl->rate[offset].idx = r->rix; ratetbl->rate[offset].count = r->adjusted_retry_count; ratetbl->rate[offset].count_cts = r->retry_count_cts; - ratetbl->rate[offset].count_rts = r->retry_count_rtscts; + ratetbl->rate[offset].count_rts = r->stats.retry_count_rtscts; } static void @@ -140,44 +140,46 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; + struct minstrel_rate_stats *mrs = &mi->r[i].stats; usecs = mr->perfect_tx_time; if (!usecs) usecs = 1000000; - if (unlikely(mr->attempts > 0)) { - mr->sample_skipped = 0; - mr->cur_prob = MINSTREL_FRAC(mr->success, mr->attempts); - mr->succ_hist += mr->success; - mr->att_hist += mr->attempts; - mr->probability = minstrel_ewma(mr->probability, - mr->cur_prob, - EWMA_LEVEL); + if (unlikely(mrs->attempts > 0)) { + mrs->sample_skipped = 0; + mrs->cur_prob = MINSTREL_FRAC(mrs->success, + mrs->attempts); + mrs->succ_hist += mrs->success; + mrs->att_hist += mrs->attempts; + mrs->probability = minstrel_ewma(mrs->probability, + mrs->cur_prob, + EWMA_LEVEL); } else - mr->sample_skipped++; + mrs->sample_skipped++; - mr->last_success = mr->success; - mr->last_attempts = mr->attempts; - mr->success = 0; - mr->attempts = 0; + mrs->last_success = mrs->success; + mrs->last_attempts = mrs->attempts; + mrs->success = 0; + mrs->attempts = 0; /* Update throughput per rate, reset thr. below 10% success */ - if (mr->probability < MINSTREL_FRAC(10, 100)) - mr->cur_tp = 0; + if (mrs->probability < MINSTREL_FRAC(10, 100)) + mrs->cur_tp = 0; else - mr->cur_tp = mr->probability * (1000000 / usecs); + mrs->cur_tp = mrs->probability * (1000000 / usecs); /* Sample less often below the 10% chance of success. * Sample less often above the 95% chance of success. */ - if (mr->probability > MINSTREL_FRAC(95, 100) || - mr->probability < MINSTREL_FRAC(10, 100)) { - mr->adjusted_retry_count = mr->retry_count >> 1; + if (mrs->probability > MINSTREL_FRAC(95, 100) || + mrs->probability < MINSTREL_FRAC(10, 100)) { + mr->adjusted_retry_count = mrs->retry_count >> 1; if (mr->adjusted_retry_count > 2) mr->adjusted_retry_count = 2; mr->sample_limit = 4; } else { mr->sample_limit = -1; - mr->adjusted_retry_count = mr->retry_count; + mr->adjusted_retry_count = mrs->retry_count; } if (!mr->adjusted_retry_count) mr->adjusted_retry_count = 2; @@ -190,11 +192,11 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * choose the maximum throughput rate as max_prob_rate * (2) if all success probabilities < 95%, the rate with * highest success probability is choosen as max_prob_rate */ - if (mr->probability >= MINSTREL_FRAC(95, 100)) { - if (mr->cur_tp >= mi->r[tmp_prob_rate].cur_tp) + if (mrs->probability >= MINSTREL_FRAC(95, 100)) { + if (mrs->cur_tp >= mi->r[tmp_prob_rate].stats.cur_tp) tmp_prob_rate = i; } else { - if (mr->probability >= mi->r[tmp_prob_rate].probability) + if (mrs->probability >= mi->r[tmp_prob_rate].stats.probability) tmp_prob_rate = i; } } @@ -240,14 +242,14 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, if (ndx < 0) continue; - mi->r[ndx].attempts += ar[i].count; + mi->r[ndx].stats.attempts += ar[i].count; if ((i != IEEE80211_TX_MAX_RATES - 1) && (ar[i + 1].idx < 0)) - mi->r[ndx].success += success; + mi->r[ndx].stats.success += success; } if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && (i >= 0)) - mi->sample_count++; + mi->sample_packets++; if (mi->sample_deferred > 0) mi->sample_deferred--; @@ -265,7 +267,7 @@ minstrel_get_retry_count(struct minstrel_rate *mr, unsigned int retry = mr->adjusted_retry_count; if (info->control.use_rts) - retry = max(2U, min(mr->retry_count_rtscts, retry)); + retry = max(2U, min(mr->stats.retry_count_rtscts, retry)); else if (info->control.use_cts_prot) retry = max(2U, min(mr->retry_count_cts, retry)); return retry; @@ -317,15 +319,15 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, sampling_ratio = mp->lookaround_rate; /* increase sum packet counter */ - mi->packet_count++; + mi->total_packets++; #ifdef CONFIG_MAC80211_DEBUGFS if (mp->fixed_rate_idx != -1) return; #endif - delta = (mi->packet_count * sampling_ratio / 100) - - (mi->sample_count + mi->sample_deferred / 2); + delta = (mi->total_packets * sampling_ratio / 100) - + (mi->sample_packets + mi->sample_deferred / 2); /* delta < 0: no sampling required */ prev_sample = mi->prev_sample; @@ -333,10 +335,10 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, if (delta < 0 || (!mrr_capable && prev_sample)) return; - if (mi->packet_count >= 10000) { + if (mi->total_packets >= 10000) { mi->sample_deferred = 0; - mi->sample_count = 0; - mi->packet_count = 0; + mi->sample_packets = 0; + mi->total_packets = 0; } else if (delta > mi->n_rates * 2) { /* With multi-rate retry, not every planned sample * attempt actually gets used, due to the way the retry @@ -347,7 +349,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, * starts getting worse, minstrel would start bursting * out lots of sampling frames, which would result * in a large throughput loss. */ - mi->sample_count += (delta - mi->n_rates * 2); + mi->sample_packets += (delta - mi->n_rates * 2); } /* get next random rate sample */ @@ -361,7 +363,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, */ if (mrr_capable && msr->perfect_tx_time > mr->perfect_tx_time && - msr->sample_skipped < 20) { + msr->stats.sample_skipped < 20) { /* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark * packets that have the sampling rate deferred to the * second MRR stage. Increase the sample counter only @@ -375,7 +377,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, if (!msr->sample_limit != 0) return; - mi->sample_count++; + mi->sample_packets++; if (msr->sample_limit > 0) msr->sample_limit--; } @@ -384,7 +386,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, * has a probability of >95%, we shouldn't be attempting * to use it, as this only wastes precious airtime */ if (!mrr_capable && - (mi->r[ndx].probability > MINSTREL_FRAC(95, 100))) + (mi->r[ndx].stats.probability > MINSTREL_FRAC(95, 100))) return; mi->prev_sample = true; @@ -459,6 +461,7 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, for (i = 0; i < sband->n_bitrates; i++) { struct minstrel_rate *mr = &mi->r[n]; + struct minstrel_rate_stats *mrs = &mi->r[n].stats; unsigned int tx_time = 0, tx_time_cts = 0, tx_time_rtscts = 0; unsigned int tx_time_single; unsigned int cw = mp->cw_min; @@ -471,6 +474,7 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, n++; memset(mr, 0, sizeof(*mr)); + memset(mrs, 0, sizeof(*mrs)); mr->rix = i; shift = ieee80211_chandef_get_shift(chandef); @@ -482,9 +486,9 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, /* calculate maximum number of retransmissions before * fallback (based on maximum segment size) */ mr->sample_limit = -1; - mr->retry_count = 1; + mrs->retry_count = 1; mr->retry_count_cts = 1; - mr->retry_count_rtscts = 1; + mrs->retry_count_rtscts = 1; tx_time = mr->perfect_tx_time + mi->sp_ack_dur; do { /* add one retransmission */ @@ -501,13 +505,13 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, (mr->retry_count_cts < mp->max_retry)) mr->retry_count_cts++; if ((tx_time_rtscts < mp->segment_size) && - (mr->retry_count_rtscts < mp->max_retry)) - mr->retry_count_rtscts++; + (mrs->retry_count_rtscts < mp->max_retry)) + mrs->retry_count_rtscts++; } while ((tx_time < mp->segment_size) && - (++mr->retry_count < mp->max_retry)); - mr->adjusted_retry_count = mr->retry_count; + (++mr->stats.retry_count < mp->max_retry)); + mr->adjusted_retry_count = mrs->retry_count; if (!(sband->bitrates[i].flags & IEEE80211_RATE_ERP_G)) - mr->retry_count_cts = mr->retry_count; + mr->retry_count_cts = mrs->retry_count; } for (i = n; i < sband->n_bitrates; i++) { @@ -665,7 +669,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) /* convert pkt per sec in kbps (1200 is the average pkt size used for * computing cur_tp */ - return MINSTREL_TRUNC(mi->r[idx].cur_tp) * 1200 * 8 / 1024; + return MINSTREL_TRUNC(mi->r[idx].stats.cur_tp) * 1200 * 8 / 1024; } const struct rate_control_ops mac80211_minstrel = { diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index 046d1bd598a8..97eca86a4af0 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -31,6 +31,27 @@ minstrel_ewma(int old, int new, int weight) return (new * (EWMA_DIV - weight) + old * weight) / EWMA_DIV; } +struct minstrel_rate_stats { + /* current / last sampling period attempts/success counters */ + unsigned int attempts, last_attempts; + unsigned int success, last_success; + + /* total attempts/success counters */ + u64 att_hist, succ_hist; + + /* current throughput */ + unsigned int cur_tp; + + /* packet delivery probabilities */ + unsigned int cur_prob, probability; + + /* maximum retry counts */ + unsigned int retry_count; + unsigned int retry_count_rtscts; + + u8 sample_skipped; + bool retry_updated; +}; struct minstrel_rate { int bitrate; @@ -40,26 +61,10 @@ struct minstrel_rate { unsigned int ack_time; int sample_limit; - unsigned int retry_count; unsigned int retry_count_cts; - unsigned int retry_count_rtscts; unsigned int adjusted_retry_count; - u32 success; - u32 attempts; - u32 last_attempts; - u32 last_success; - u8 sample_skipped; - - /* parts per thousand */ - u32 cur_prob; - u32 probability; - - /* per-rate throughput */ - u32 cur_tp; - - u64 succ_hist; - u64 att_hist; + struct minstrel_rate_stats stats; }; struct minstrel_sta_info { @@ -73,8 +78,8 @@ struct minstrel_sta_info { u8 max_tp_rate[MAX_THR_RATES]; u8 max_prob_rate; - unsigned int packet_count; - unsigned int sample_count; + unsigned int total_packets; + unsigned int sample_packets; int sample_deferred; unsigned int sample_row; diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index fd0b9ca1570e..edde723f9f00 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -72,6 +72,7 @@ minstrel_stats_open(struct inode *inode, struct file *file) "this succ/attempt success attempts\n"); for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; + struct minstrel_rate_stats *mrs = &mi->r[i].stats; *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' '; *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' '; @@ -81,24 +82,24 @@ minstrel_stats_open(struct inode *inode, struct file *file) p += sprintf(p, "%3u%s", mr->bitrate / 2, (mr->bitrate & 1 ? ".5" : " ")); - tp = MINSTREL_TRUNC(mr->cur_tp / 10); - prob = MINSTREL_TRUNC(mr->cur_prob * 1000); - eprob = MINSTREL_TRUNC(mr->probability * 1000); + tp = MINSTREL_TRUNC(mrs->cur_tp / 10); + prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); + eprob = MINSTREL_TRUNC(mrs->probability * 1000); p += sprintf(p, " %6u.%1u %6u.%1u %6u.%1u " " %3u(%3u) %8llu %8llu\n", tp / 10, tp % 10, eprob / 10, eprob % 10, prob / 10, prob % 10, - mr->last_success, - mr->last_attempts, - (unsigned long long)mr->succ_hist, - (unsigned long long)mr->att_hist); + mrs->last_success, + mrs->last_attempts, + (unsigned long long)mrs->succ_hist, + (unsigned long long)mrs->att_hist); } p += sprintf(p, "\nTotal packet count:: ideal %d " "lookaround %d\n\n", - mi->packet_count - mi->sample_count, - mi->sample_count); + mi->total_packets - mi->sample_packets, + mi->sample_packets); ms->len = p - ms->buf; return 0; diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index d655586773ac..5fee938b11c6 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -26,28 +26,6 @@ struct mcs_group { extern const struct mcs_group minstrel_mcs_groups[]; -struct minstrel_rate_stats { - /* current / last sampling period attempts/success counters */ - unsigned int attempts, last_attempts; - unsigned int success, last_success; - - /* total attempts/success counters */ - u64 att_hist, succ_hist; - - /* current throughput */ - unsigned int cur_tp; - - /* packet delivery probabilities */ - unsigned int cur_prob, probability; - - /* maximum retry counts */ - unsigned int retry_count; - unsigned int retry_count_rtscts; - - bool retry_updated; - u8 sample_skipped; -}; - struct minstrel_mcs_group_data { u8 index; u8 column; -- cgit v1.2.3 From 5935839ad73583781b8bbe8d91412f6826e218a4 Mon Sep 17 00:00:00 2001 From: Thomas Huehn Date: Tue, 9 Sep 2014 23:22:14 +0200 Subject: mac80211: improve minstrel_ht rate sorting by throughput & probability This patch improves the way minstrel_ht sorts rates according to throughput and success probability. 3 FOR-loops across the entire rate and mcs group set in function minstrel_ht_update_stats() which where used to determine the fastest, second fastest and most robust rate are reduced to 2 FOR-loop. The sorted list of rates according throughput is extended to the best four rates as we need them in upcoming joint rate and power control. The sorting is done via the new function minstrel_ht_sort_best_tp_rates(). The annotation of those 4 best throughput rates in the debugfs file rc-stats is changes to: "A,B,C,D", where A is the fastest rate and C the 4th fastest. Signed-off-by: Thomas Huehn Tested-by: Stefan Venz Acked-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_ht.c | 303 +++++++++++++++++++---------- net/mac80211/rc80211_minstrel_ht.h | 19 +- net/mac80211/rc80211_minstrel_ht_debugfs.c | 10 +- 3 files changed, 213 insertions(+), 119 deletions(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 85c1e74b7714..df90ce2db00c 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -135,7 +135,7 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi); static int minstrel_ht_get_group_idx(struct ieee80211_tx_rate *rate) { - return GROUP_IDX((rate->idx / 8) + 1, + return GROUP_IDX((rate->idx / MCS_GROUP_RATES) + 1, !!(rate->flags & IEEE80211_TX_RC_SHORT_GI), !!(rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)); } @@ -232,13 +232,152 @@ minstrel_ht_calc_tp(struct minstrel_ht_sta *mi, int group, int rate) mr->cur_tp = MINSTREL_TRUNC(tp); } +/* + * Find & sort topmost throughput rates + * + * If multiple rates provide equal throughput the sorting is based on their + * current success probability. Higher success probability is preferred among + * MCS groups, CCK rates do not provide aggregation and are therefore at last. + */ +static void +minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u8 index, + u8 *tp_list) +{ + int cur_group, cur_idx, cur_thr, cur_prob; + int tmp_group, tmp_idx, tmp_thr, tmp_prob; + int j = MAX_THR_RATES; + + cur_group = index / MCS_GROUP_RATES; + cur_idx = index % MCS_GROUP_RATES; + cur_thr = mi->groups[cur_group].rates[cur_idx].cur_tp; + cur_prob = mi->groups[cur_group].rates[cur_idx].probability; + + tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; + tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; + tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; + + while (j > 0 && (cur_thr > tmp_thr || + (cur_thr == tmp_thr && cur_prob > tmp_prob))) { + j--; + tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; + tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; + tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; + } + + if (j < MAX_THR_RATES - 1) { + memmove(&tp_list[j + 1], &tp_list[j], (sizeof(*tp_list) * + (MAX_THR_RATES - (j + 1)))); + } + if (j < MAX_THR_RATES) + tp_list[j] = index; +} + +/* + * Find and set the topmost probability rate per sta and per group + */ +static void +minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u8 index) +{ + struct minstrel_mcs_group_data *mg; + struct minstrel_rate_stats *mr; + int tmp_group, tmp_idx, tmp_tp, tmp_prob, max_tp_group; + + mg = &mi->groups[index / MCS_GROUP_RATES]; + mr = &mg->rates[index % MCS_GROUP_RATES]; + + tmp_group = mi->max_prob_rate / MCS_GROUP_RATES; + tmp_idx = mi->max_prob_rate % MCS_GROUP_RATES; + tmp_tp = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; + + /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from + * MCS_GROUP as well as CCK_GROUP rates do not allow aggregation */ + max_tp_group = mi->max_tp_rate[0] / MCS_GROUP_RATES; + if((index / MCS_GROUP_RATES == MINSTREL_CCK_GROUP) && + (max_tp_group != MINSTREL_CCK_GROUP)) + return; + + if (mr->probability > MINSTREL_FRAC(75, 100)) { + if (mr->cur_tp > tmp_tp) + mi->max_prob_rate = index; + if (mr->cur_tp > mg->rates[mg->max_group_prob_rate].cur_tp) + mg->max_group_prob_rate = index; + } else { + if (mr->probability > tmp_prob) + mi->max_prob_rate = index; + if (mr->probability > mg->rates[mg->max_group_prob_rate].probability) + mg->max_group_prob_rate = index; + } +} + + +/* + * Assign new rate set per sta and use CCK rates only if the fastest + * rate (max_tp_rate[0]) is from CCK group. This prohibits such sorted + * rate sets where MCS and CCK rates are mixed, because CCK rates can + * not use aggregation. + */ +static void +minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, + u8 tmp_mcs_tp_rate[MAX_THR_RATES], + u8 tmp_cck_tp_rate[MAX_THR_RATES]) +{ + unsigned int tmp_group, tmp_idx, tmp_cck_tp, tmp_mcs_tp; + int i; + + tmp_group = tmp_cck_tp_rate[0] / MCS_GROUP_RATES; + tmp_idx = tmp_cck_tp_rate[0] % MCS_GROUP_RATES; + tmp_cck_tp = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + + tmp_group = tmp_mcs_tp_rate[0] / MCS_GROUP_RATES; + tmp_idx = tmp_mcs_tp_rate[0] % MCS_GROUP_RATES; + tmp_mcs_tp = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + + if (tmp_cck_tp > tmp_mcs_tp) { + for(i = 0; i < MAX_THR_RATES; i++) { + minstrel_ht_sort_best_tp_rates(mi, tmp_cck_tp_rate[i], + tmp_mcs_tp_rate); + } + } + +} + +/* + * Try to increase robustness of max_prob rate by decrease number of + * streams if possible. + */ +static inline void +minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) +{ + struct minstrel_mcs_group_data *mg; + struct minstrel_rate_stats *mr; + int tmp_max_streams, group; + int tmp_tp = 0; + + tmp_max_streams = minstrel_mcs_groups[mi->max_tp_rate[0] / + MCS_GROUP_RATES].streams; + for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { + mg = &mi->groups[group]; + if (!mg->supported || group == MINSTREL_CCK_GROUP) + continue; + mr = minstrel_get_ratestats(mi, mg->max_group_prob_rate); + if (tmp_tp < mr->cur_tp && + (minstrel_mcs_groups[group].streams < tmp_max_streams)) { + mi->max_prob_rate = mg->max_group_prob_rate; + tmp_tp = mr->cur_tp; + } + } +} + /* * Update rate statistics and select new primary rates * * Rules for rate selection: * - max_prob_rate must use only one stream, as a tradeoff between delivery * probability and throughput during strong fluctuations - * - as long as the max prob rate has a probability of more than 3/4, pick + * - as long as the max prob rate has a probability of more than 75%, pick * higher throughput rates, even if the probablity is a bit lower */ static void @@ -246,9 +385,9 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mr; - int cur_prob, cur_prob_tp, cur_tp, cur_tp2; - int group, i, index; - bool mi_rates_valid = false; + int group, i, j; + u8 tmp_mcs_tp_rate[MAX_THR_RATES], tmp_group_tp_rate[MAX_THR_RATES]; + u8 tmp_cck_tp_rate[MAX_THR_RATES], index; if (mi->ampdu_packets > 0) { mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len, @@ -260,13 +399,14 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) mi->sample_slow = 0; mi->sample_count = 0; - for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { - bool mg_rates_valid = false; + /* Initialize global rate indexes */ + for(j = 0; j < MAX_THR_RATES; j++){ + tmp_mcs_tp_rate[j] = 0; + tmp_cck_tp_rate[j] = 0; + } - cur_prob = 0; - cur_prob_tp = 0; - cur_tp = 0; - cur_tp2 = 0; + /* Find best rate sets within all MCS groups*/ + for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { mg = &mi->groups[group]; if (!mg->supported) @@ -274,24 +414,16 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) mi->sample_count++; + /* (re)Initialize group rate indexes */ + for(j = 0; j < MAX_THR_RATES; j++) + tmp_group_tp_rate[j] = group; + for (i = 0; i < MCS_GROUP_RATES; i++) { if (!(mg->supported & BIT(i))) continue; index = MCS_GROUP_RATES * group + i; - /* initialize rates selections starting indexes */ - if (!mg_rates_valid) { - mg->max_tp_rate = mg->max_tp_rate2 = - mg->max_prob_rate = i; - if (!mi_rates_valid) { - mi->max_tp_rate = mi->max_tp_rate2 = - mi->max_prob_rate = index; - mi_rates_valid = true; - } - mg_rates_valid = true; - } - mr = &mg->rates[i]; mr->retry_updated = false; minstrel_calc_rate_ewma(mr); @@ -300,82 +432,47 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) if (!mr->cur_tp) continue; - if ((mr->cur_tp > cur_prob_tp && mr->probability > - MINSTREL_FRAC(3, 4)) || mr->probability > cur_prob) { - mg->max_prob_rate = index; - cur_prob = mr->probability; - cur_prob_tp = mr->cur_tp; - } - - if (mr->cur_tp > cur_tp) { - swap(index, mg->max_tp_rate); - cur_tp = mr->cur_tp; - mr = minstrel_get_ratestats(mi, index); - } - - if (index >= mg->max_tp_rate) - continue; - - if (mr->cur_tp > cur_tp2) { - mg->max_tp_rate2 = index; - cur_tp2 = mr->cur_tp; + /* Find max throughput rate set */ + if (group != MINSTREL_CCK_GROUP) { + minstrel_ht_sort_best_tp_rates(mi, index, + tmp_mcs_tp_rate); + } else if (group == MINSTREL_CCK_GROUP) { + minstrel_ht_sort_best_tp_rates(mi, index, + tmp_cck_tp_rate); } - } - } - /* try to sample all available rates during each interval */ - mi->sample_count *= 8; + /* Find max throughput rate set within a group */ + minstrel_ht_sort_best_tp_rates(mi, index, + tmp_group_tp_rate); - cur_prob = 0; - cur_prob_tp = 0; - cur_tp = 0; - cur_tp2 = 0; - for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { - mg = &mi->groups[group]; - if (!mg->supported) - continue; - - mr = minstrel_get_ratestats(mi, mg->max_tp_rate); - if (cur_tp < mr->cur_tp) { - mi->max_tp_rate2 = mi->max_tp_rate; - cur_tp2 = cur_tp; - mi->max_tp_rate = mg->max_tp_rate; - cur_tp = mr->cur_tp; - mi->max_prob_streams = minstrel_mcs_groups[group].streams - 1; + /* Find max probability rate per group and global */ + minstrel_ht_set_best_prob_rate(mi, index); } - mr = minstrel_get_ratestats(mi, mg->max_tp_rate2); - if (cur_tp2 < mr->cur_tp) { - mi->max_tp_rate2 = mg->max_tp_rate2; - cur_tp2 = mr->cur_tp; - } + memcpy(mg->max_group_tp_rate, tmp_group_tp_rate, + sizeof(mg->max_group_tp_rate)); } - if (mi->max_prob_streams < 1) - mi->max_prob_streams = 1; + /* Assign new rate set per sta */ + minstrel_ht_assign_best_tp_rates(mi, tmp_mcs_tp_rate, tmp_cck_tp_rate); + memcpy(mi->max_tp_rate, tmp_mcs_tp_rate, sizeof(mi->max_tp_rate)); - for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { - mg = &mi->groups[group]; - if (!mg->supported) - continue; - mr = minstrel_get_ratestats(mi, mg->max_prob_rate); - if (cur_prob_tp < mr->cur_tp && - minstrel_mcs_groups[group].streams <= mi->max_prob_streams) { - mi->max_prob_rate = mg->max_prob_rate; - cur_prob = mr->cur_prob; - cur_prob_tp = mr->cur_tp; - } - } + /* Try to increase robustness of max_prob_rate*/ + minstrel_ht_prob_rate_reduce_streams(mi); + + /* try to sample all available rates during each interval */ + mi->sample_count *= 8; #ifdef CONFIG_MAC80211_DEBUGFS /* use fixed index if set */ if (mp->fixed_rate_idx != -1) { - mi->max_tp_rate = mp->fixed_rate_idx; - mi->max_tp_rate2 = mp->fixed_rate_idx; + for (i = 0; i < 4; i++) + mi->max_tp_rate[i] = mp->fixed_rate_idx; mi->max_prob_rate = mp->fixed_rate_idx; } #endif + /* Reset update timer */ mi->stats_update = jiffies; } @@ -420,8 +517,7 @@ minstrel_next_sample_idx(struct minstrel_ht_sta *mi) } static void -minstrel_downgrade_rate(struct minstrel_ht_sta *mi, unsigned int *idx, - bool primary) +minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u8 *idx, bool primary) { int group, orig_group; @@ -437,9 +533,9 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, unsigned int *idx, continue; if (primary) - *idx = mi->groups[group].max_tp_rate; + *idx = mi->groups[group].max_group_tp_rate[0]; else - *idx = mi->groups[group].max_tp_rate2; + *idx = mi->groups[group].max_group_tp_rate[1]; break; } } @@ -524,19 +620,19 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, * check for sudden death of spatial multiplexing, * downgrade to a lower number of streams if necessary. */ - rate = minstrel_get_ratestats(mi, mi->max_tp_rate); + rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); if (rate->attempts > 30 && MINSTREL_FRAC(rate->success, rate->attempts) < MINSTREL_FRAC(20, 100)) { - minstrel_downgrade_rate(mi, &mi->max_tp_rate, true); + minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true); update = true; } - rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate2); + rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]); if (rate2->attempts > 30 && MINSTREL_FRAC(rate2->success, rate2->attempts) < MINSTREL_FRAC(20, 100)) { - minstrel_downgrade_rate(mi, &mi->max_tp_rate2, false); + minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false); update = true; } @@ -661,12 +757,12 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) if (!rates) return; - /* Start with max_tp_rate */ - minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate); + /* Start with max_tp_rate[0] */ + minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[0]); if (mp->hw->max_rates >= 3) { - /* At least 3 tx rates supported, use max_tp_rate2 next */ - minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate2); + /* At least 3 tx rates supported, use max_tp_rate[1] next */ + minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[1]); } if (mp->hw->max_rates >= 2) { @@ -691,7 +787,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct minstrel_rate_stats *mr; struct minstrel_mcs_group_data *mg; - unsigned int sample_dur, sample_group; + unsigned int sample_dur, sample_group, cur_max_tp_streams; int sample_idx = 0; if (mi->sample_wait > 0) { @@ -718,8 +814,8 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) * to the frame. Hence, don't use sampling for the currently * used rates. */ - if (sample_idx == mi->max_tp_rate || - sample_idx == mi->max_tp_rate2 || + if (sample_idx == mi->max_tp_rate[0] || + sample_idx == mi->max_tp_rate[1] || sample_idx == mi->max_prob_rate) return -1; @@ -734,9 +830,12 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) * Make sure that lower rates get sampled only occasionally, * if the link is working perfectly. */ + + cur_max_tp_streams = minstrel_mcs_groups[mi->max_tp_rate[0] / + MCS_GROUP_RATES].streams; sample_dur = minstrel_get_duration(sample_idx); - if (sample_dur >= minstrel_get_duration(mi->max_tp_rate2) && - (mi->max_prob_streams < + if (sample_dur >= minstrel_get_duration(mi->max_tp_rate[1]) && + (cur_max_tp_streams - 1 < minstrel_mcs_groups[sample_group].streams || sample_dur >= minstrel_get_duration(mi->max_prob_rate))) { if (mr->sample_skipped < 20) @@ -1041,8 +1140,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) if (!msp->is_ht) return mac80211_minstrel.get_expected_throughput(priv_sta); - i = mi->max_tp_rate / MCS_GROUP_RATES; - j = mi->max_tp_rate % MCS_GROUP_RATES; + i = mi->max_tp_rate[0] / MCS_GROUP_RATES; + j = mi->max_tp_rate[0] % MCS_GROUP_RATES; /* convert cur_tp from pkt per second in kbps */ return mi->groups[i].rates[j].cur_tp * AVG_PKT_SIZE * 8 / 1024; diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index 5fee938b11c6..01570e0e014b 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -33,10 +33,9 @@ struct minstrel_mcs_group_data { /* bitfield of supported MCS rates of this group */ u8 supported; - /* selected primary rates */ - unsigned int max_tp_rate; - unsigned int max_tp_rate2; - unsigned int max_prob_rate; + /* sorted rate set within a MCS group*/ + u8 max_group_tp_rate[MAX_THR_RATES]; + u8 max_group_prob_rate; /* MCS rate statistics */ struct minstrel_rate_stats rates[MCS_GROUP_RATES]; @@ -52,15 +51,9 @@ struct minstrel_ht_sta { /* ampdu length (EWMA) */ unsigned int avg_ampdu_len; - /* best throughput rate */ - unsigned int max_tp_rate; - - /* second best throughput rate */ - unsigned int max_tp_rate2; - - /* best probability rate */ - unsigned int max_prob_rate; - unsigned int max_prob_streams; + /* overall sorted rate set */ + u8 max_tp_rate[MAX_THR_RATES]; + u8 max_prob_rate; /* time of last status update */ unsigned long stats_update; diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 3e7d793de0c3..a72ad46f2a04 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -46,8 +46,10 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) else p += sprintf(p, "HT%c0/%cGI ", htmode, gimode); - *(p++) = (idx == mi->max_tp_rate) ? 'T' : ' '; - *(p++) = (idx == mi->max_tp_rate2) ? 't' : ' '; + *(p++) = (idx == mi->max_tp_rate[0]) ? 'A' : ' '; + *(p++) = (idx == mi->max_tp_rate[1]) ? 'B' : ' '; + *(p++) = (idx == mi->max_tp_rate[2]) ? 'C' : ' '; + *(p++) = (idx == mi->max_tp_rate[3]) ? 'D' : ' '; *(p++) = (idx == mi->max_prob_rate) ? 'P' : ' '; if (i == max_mcs) { @@ -100,8 +102,8 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) file->private_data = ms; p = ms->buf; - p += sprintf(p, "type rate throughput ewma prob this prob " - "retry this succ/attempt success attempts\n"); + p += sprintf(p, "type rate throughput ewma prob " + "this prob retry this succ/attempt success attempts\n"); p = minstrel_ht_stats_dump(mi, max_mcs, p); for (i = 0; i < max_mcs; i++) -- cgit v1.2.3 From 9d58f25b12f70f52581a5a1fdb1e59d322c4c729 Mon Sep 17 00:00:00 2001 From: Liad Kaufman Date: Thu, 4 Sep 2014 08:28:40 +0300 Subject: mac80211: add TDLS connection timeout Adding a timeout for tearing down a TDLS connection that hasn't had ACKed traffic sent through it for a certain amount of time. Since we have no other monitoring facility to indicate the existance (or non-existance) of a peer, this patch will cause a peer to be considered as unavailable if for some X time at least some Y packets have all not been ACKed. Signed-off-by: Liad Kaufman Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/sta_info.h | 3 +++ net/mac80211/status.c | 21 ++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 9dc7a9d6b0d2..42f68cb8957e 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -450,6 +450,9 @@ struct sta_info { enum ieee80211_smps_mode known_smps_mode; const struct ieee80211_cipher_scheme *cipher_scheme; + /* TDLS timeout data */ + unsigned long last_tdls_pkt_time; + /* keep last! */ struct ieee80211_sta sta; }; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 2800e1c65519..89290e33dafe 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -538,6 +538,8 @@ static void ieee80211_tx_latency_end_msrmnt(struct ieee80211_local *local, * - current throughput (higher value for higher tpt)? */ #define STA_LOST_PKT_THRESHOLD 50 +#define STA_LOST_TDLS_PKT_THRESHOLD 10 +#define STA_LOST_TDLS_PKT_TIME (10*HZ) /* 10secs since last ACK */ static void ieee80211_lost_packet(struct sta_info *sta, struct sk_buff *skb) { @@ -548,7 +550,20 @@ static void ieee80211_lost_packet(struct sta_info *sta, struct sk_buff *skb) !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; - if (++sta->lost_packets < STA_LOST_PKT_THRESHOLD) + sta->lost_packets++; + if (!sta->sta.tdls && sta->lost_packets < STA_LOST_PKT_THRESHOLD) + return; + + /* + * If we're in TDLS mode, make sure that all STA_LOST_TDLS_PKT_THRESHOLD + * of the last packets were lost, and that no ACK was received in the + * last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss + * mechanism. + */ + if (sta->sta.tdls && + (sta->lost_packets < STA_LOST_TDLS_PKT_THRESHOLD || + time_before(jiffies, + sta->last_tdls_pkt_time + STA_LOST_TDLS_PKT_TIME))) return; cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, @@ -695,6 +710,10 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if (info->flags & IEEE80211_TX_STAT_ACK) { if (sta->lost_packets) sta->lost_packets = 0; + + /* Track when last TDLS packet was ACKed */ + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) + sta->last_tdls_pkt_time = jiffies; } else { ieee80211_lost_packet(sta, skb); } -- cgit v1.2.3 From 960d01acf62747d6518694f92be5b06f67473833 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 9 Sep 2014 22:55:35 +0300 Subject: cfg80211: add WMM traffic stream API Add nl80211 and driver API to validate, add and delete traffic streams with appropriate settings. The API calls for userspace doing the action frame handshake with the peer, and then allows only to set up the parameters in the driver. To avoid setting up a session only to tear it down again, the validate API is provided, but the real usage later can still fail so userspace must be prepared for that. Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++ include/net/cfg80211.h | 23 ++++++++- include/uapi/linux/nl80211.h | 28 +++++++++++ net/wireless/nl80211.c | 109 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 31 ++++++++++++ net/wireless/trace.h | 45 ++++++++++++++++++ 6 files changed, 239 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 61a09f0644aa..b1be39c76931 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -166,8 +166,12 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_MESH_ID_LEN 32 +#define IEEE80211_FIRST_TSPEC_TSID 8 #define IEEE80211_NUM_TIDS 16 +/* number of user priorities 802.11 uses */ +#define IEEE80211_NUM_UPS 8 + #define IEEE80211_QOS_CTL_LEN 2 /* 1d tag mask */ #define IEEE80211_QOS_CTL_TAG1D_MASK 0x0007 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c2c710c14a50..a13beab123dc 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2318,6 +2318,17 @@ struct cfg80211_qos_map { * @set_ap_chanwidth: Set the AP (including P2P GO) mode channel width for the * given interface This is used e.g. for dynamic HT 20/40 MHz channel width * changes during the lifetime of the BSS. + * + * @add_tx_ts: validate (if admitted_time is 0) or add a TX TS to the device + * with the given parameters; action frame exchange has been handled by + * userspace so this just has to modify the TX path to take the TS into + * account. + * If the admitted time is 0 just validate the parameters to make sure + * the session can be created at all; it is valid to just always return + * success for that but that may result in inefficient behaviour (handshake + * with the peer followed by immediate teardown when the addition is later + * rejected) + * @del_tx_ts: remove an existing TX TS */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2558,6 +2569,12 @@ struct cfg80211_ops { int (*set_ap_chanwidth)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef); + + int (*add_tx_ts)(struct wiphy *wiphy, struct net_device *dev, + u8 tsid, const u8 *peer, u8 user_prio, + u16 admitted_time); + int (*del_tx_ts)(struct wiphy *wiphy, struct net_device *dev, + u8 tsid, const u8 *peer); }; /* @@ -2604,9 +2621,13 @@ struct cfg80211_ops { * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels. * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in * beaconing mode (AP, IBSS, Mesh, ...). + * @WIPHY_FLAG_SUPPORTS_WMM_ADMISSION: the device supports setting up WMM + * TSPEC sessions (TID aka TSID 0-7) with the NL80211_CMD_ADD_TX_TS + * command. Standard IEEE 802.11 TSPEC setup is not yet supported, it + * needs to be able to handle Block-Ack agreements and other things. */ enum wiphy_flags { - /* use hole at 0 */ + WIPHY_FLAG_SUPPORTS_WMM_ADMISSION = BIT(0), /* use hole at 1 */ /* use hole at 2 */ WIPHY_FLAG_NETNS_OK = BIT(3), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 29c4399e08b9..e5b8caf5e466 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -722,6 +722,22 @@ * QoS mapping is relevant for IP packets, it is only valid during an * association. This is cleared on disassociation and AP restart. * + * @NL80211_CMD_ADD_TX_TS: Ask the kernel to add a traffic stream for the given + * %NL80211_ATTR_TSID and %NL80211_ATTR_MAC with %NL80211_ATTR_USER_PRIO + * and %NL80211_ATTR_ADMITTED_TIME parameters. + * Note that the action frame handshake with the AP shall be handled by + * userspace via the normal management RX/TX framework, this only sets + * up the TX TS in the driver/device. + * If the admitted time attribute is not added then the request just checks + * if a subsequent setup could be successful, the intent is to use this to + * avoid setting up a session with the AP when local restrictions would + * make that impossible. However, the subsequent "real" setup may still + * fail even if the check was successful. + * @NL80211_CMD_DEL_TX_TS: Remove an existing TS with the %NL80211_ATTR_TSID + * and %NL80211_ATTR_MAC parameters. It isn't necessary to call this + * before removing a station entry entirely, or before disassociating + * or similar, cleanup will happen in the driver/device in this case. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -893,6 +909,9 @@ enum nl80211_commands { NL80211_CMD_SET_QOS_MAP, + NL80211_CMD_ADD_TX_TS, + NL80211_CMD_DEL_TX_TS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -1611,6 +1630,11 @@ enum nl80211_commands { * drivers to indicate dynack capability. Dynack is automatically disabled * setting valid value for coverage class. * + * @NL80211_ATTR_TSID: a TSID value (u8 attribute) + * @NL80211_ATTR_USER_PRIO: user priority value (u8 attribute) + * @NL80211_ATTR_ADMITTED_TIME: admitted time in units of 32 microseconds + * (per second) (u16 attribute) + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1957,6 +1981,10 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_DYN_ACK, + NL80211_ATTR_TSID, + NL80211_ATTR_USER_PRIO, + NL80211_ATTR_ADMITTED_TIME, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e9fbd4f4ddb0..ab7ee4893e40 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -391,6 +391,9 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_IFACE_SOCKET_OWNER] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG }, + [NL80211_ATTR_TSID] = { .type = NLA_U8 }, + [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, + [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, }; /* policy for the key attributes */ @@ -1510,6 +1513,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH) CMD(channel_switch, CHANNEL_SWITCH); CMD(set_qos_map, SET_QOS_MAP); + if (rdev->wiphy.flags & + WIPHY_FLAG_SUPPORTS_WMM_ADMISSION) + CMD(add_tx_ts, ADD_TX_TS); } /* add into the if now */ #undef CMD @@ -9390,6 +9396,93 @@ static int nl80211_set_qos_map(struct sk_buff *skb, return ret; } +static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *peer; + u8 tsid, up; + u16 admitted_time = 0; + int err; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_USER_PRIO]) + return -EINVAL; + + tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); + if (tsid >= IEEE80211_NUM_TIDS) + return -EINVAL; + + up = nla_get_u8(info->attrs[NL80211_ATTR_USER_PRIO]); + if (up >= IEEE80211_NUM_UPS) + return -EINVAL; + + /* WMM uses TIDs 0-7 even for TSPEC */ + if (tsid < IEEE80211_FIRST_TSPEC_TSID) { + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) + return -EINVAL; + } else { + /* TODO: handle 802.11 TSPEC/admission control + * need more attributes for that (e.g. BA session requirement) + */ + return -EINVAL; + } + + peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_ADMITTED_TIME]) { + admitted_time = + nla_get_u16(info->attrs[NL80211_ATTR_ADMITTED_TIME]); + if (!admitted_time) + return -EINVAL; + } + + wdev_lock(wdev); + switch (wdev->iftype) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (wdev->current_bss) + break; + err = -ENOTCONN; + goto out; + default: + err = -EOPNOTSUPP; + goto out; + } + + err = rdev_add_tx_ts(rdev, dev, tsid, peer, up, admitted_time); + + out: + wdev_unlock(wdev); + return err; +} + +static int nl80211_del_tx_ts(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *peer; + u8 tsid; + int err; + + if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); + peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + + wdev_lock(wdev); + err = rdev_del_tx_ts(rdev, dev, tsid, peer); + wdev_unlock(wdev); + + return err; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -10147,6 +10240,22 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_ADD_TX_TS, + .doit = nl80211_add_tx_ts, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_TX_TS, + .doit = nl80211_del_tx_ts, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; /* notification functions */ diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 56c2240c30ce..f6d457d6a558 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -915,4 +915,35 @@ rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_add_tx_ts(struct cfg80211_registered_device *rdev, + struct net_device *dev, u8 tsid, const u8 *peer, + u8 user_prio, u16 admitted_time) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_add_tx_ts(&rdev->wiphy, dev, tsid, peer, + user_prio, admitted_time); + if (rdev->ops->add_tx_ts) + ret = rdev->ops->add_tx_ts(&rdev->wiphy, dev, tsid, peer, + user_prio, admitted_time); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + +static inline int +rdev_del_tx_ts(struct cfg80211_registered_device *rdev, + struct net_device *dev, u8 tsid, const u8 *peer) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_del_tx_ts(&rdev->wiphy, dev, tsid, peer); + if (rdev->ops->del_tx_ts) + ret = rdev->ops->del_tx_ts(&rdev->wiphy, dev, tsid, peer); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 0c524cd76c83..625a6e6d1168 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1896,6 +1896,51 @@ TRACE_EVENT(rdev_set_ap_chanwidth, WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) ); +TRACE_EVENT(rdev_add_tx_ts, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time), + TP_ARGS(wiphy, netdev, tsid, peer, user_prio, admitted_time), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tsid) + __field(u8, user_prio) + __field(u16, admitted_time) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tsid = tsid; + __entry->user_prio = user_prio; + __entry->admitted_time = admitted_time; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d, UP %d, time %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), + __entry->tsid, __entry->user_prio, __entry->admitted_time) +); + +TRACE_EVENT(rdev_del_tx_ts, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + u8 tsid, const u8 *peer), + TP_ARGS(wiphy, netdev, tsid, peer), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tsid) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tsid = tsid; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tsid) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ -- cgit v1.2.3 From b0b6aa2c8e0d0e34f7658d5cc1e4fbb59f701c42 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Tue, 9 Sep 2014 17:09:45 +0300 Subject: cfg80211/mac80211: add wmm info to assoc event Userspace might need to know what queues are configured for uapsd (e.g. for setting proper default values in tspecs). Add this bitmap to the association event (inside wmm nested attribute) Add additional parameter to cfg80211_rx_assoc_resp, and update its callers. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- net/mac80211/mlme.c | 9 ++++++++- net/wireless/mlme.c | 4 ++-- net/wireless/nl80211.c | 28 +++++++++++++++++++++------- net/wireless/nl80211.h | 3 ++- 5 files changed, 36 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a13beab123dc..2ed1f80f7eb6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3944,6 +3944,7 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); * moves to cfg80211 in this call * @buf: authentication frame (header + body) * @len: length of the frame data + * @uapsd_queues: bitmap of ACs configured to uapsd. -1 if n/a. * * After being asked to associate via cfg80211_ops::assoc() the driver must * call either this function or cfg80211_auth_timeout(). @@ -3952,7 +3953,8 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); */ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, - const u8 *buf, size_t len); + const u8 *buf, size_t len, + int uapsd_queues); /** * cfg80211_assoc_timeout - notification of timed out association diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a7b92f5f7161..efa41fc66c1f 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2817,6 +2817,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; u16 capab_info, status_code, aid; struct ieee802_11_elems elems; + int ac, uapsd_queues = -1; u8 *pos; bool reassoc; struct cfg80211_bss *bss; @@ -2886,9 +2887,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, * is set can cause the interface to go idle */ ieee80211_destroy_assoc_data(sdata, true); + + /* get uapsd queues configuration */ + uapsd_queues = 0; + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + if (sdata->tx_conf[ac].uapsd) + uapsd_queues |= BIT(ac); } - cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len); + cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len, uapsd_queues); } static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 369fc334fdad..2c52b59e43f3 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -19,7 +19,7 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, - const u8 *buf, size_t len) + const u8 *buf, size_t len, int uapsd_queues) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; @@ -43,7 +43,7 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, return; } - nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL); + nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL, uapsd_queues); /* update current_bss etc., consumes the bss reference */ __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs, status_code, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ab7ee4893e40..e8114c7a37e3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10524,7 +10524,8 @@ nla_put_failure: static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, - enum nl80211_commands cmd, gfp_t gfp) + enum nl80211_commands cmd, gfp_t gfp, + int uapsd_queues) { struct sk_buff *msg; void *hdr; @@ -10544,6 +10545,19 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_FRAME, len, buf)) goto nla_put_failure; + if (uapsd_queues >= 0) { + struct nlattr *nla_wmm = + nla_nest_start(msg, NL80211_ATTR_STA_WME); + if (!nla_wmm) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_STA_WME_UAPSD_QUEUES, + uapsd_queues)) + goto nla_put_failure; + + nla_nest_end(msg, nla_wmm); + } + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, @@ -10560,15 +10574,15 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, size_t len, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_AUTHENTICATE, gfp); + NL80211_CMD_AUTHENTICATE, gfp, -1); } void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, - size_t len, gfp_t gfp) + size_t len, gfp_t gfp, int uapsd_queues) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_ASSOCIATE, gfp); + NL80211_CMD_ASSOCIATE, gfp, uapsd_queues); } void nl80211_send_deauth(struct cfg80211_registered_device *rdev, @@ -10576,7 +10590,7 @@ void nl80211_send_deauth(struct cfg80211_registered_device *rdev, size_t len, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_DEAUTHENTICATE, gfp); + NL80211_CMD_DEAUTHENTICATE, gfp, -1); } void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, @@ -10584,7 +10598,7 @@ void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, size_t len, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_DISASSOCIATE, gfp); + NL80211_CMD_DISASSOCIATE, gfp, -1); } void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, @@ -10605,7 +10619,7 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, cmd = NL80211_CMD_UNPROT_DISASSOCIATE; trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len); - nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC); + nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC, -1); } EXPORT_SYMBOL(cfg80211_rx_unprot_mlme_mgmt); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 49c9a482dd12..7ad70d6f0cc6 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -23,7 +23,8 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, const u8 *buf, size_t len, gfp_t gfp); void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, - const u8 *buf, size_t len, gfp_t gfp); + const u8 *buf, size_t len, gfp_t gfp, + int uapsd_queues); void nl80211_send_deauth(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, gfp_t gfp); -- cgit v1.2.3 From 59cd85cbcf2ecca9736652dc0cfd2ec600d7ef2a Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Tue, 9 Sep 2014 17:11:02 +0300 Subject: mac80211: set network header in TDLS frames Correctly mark the network header location in mac80211-generated TDLS frames. These may be used by lower-level drivers. Signed-off-by: Arik Nemtsov Reviewed-by: Johannes Berg Signed-off-by: Johannes Berg --- net/mac80211/tdls.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index bcf51e43ccc1..4ea25dec0698 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -412,6 +412,9 @@ ieee80211_prep_tdls_encap_data(struct wiphy *wiphy, struct net_device *dev, tf->ether_type = cpu_to_be16(ETH_P_TDLS); tf->payload_type = WLAN_TDLS_SNAP_RFTYPE; + /* network header is after the ethernet header */ + skb_set_network_header(skb, ETH_HLEN); + switch (action_code) { case WLAN_TDLS_SETUP_REQUEST: tf->category = WLAN_CATEGORY_TDLS; -- cgit v1.2.3 From 18998c381b19bfc3c285361ff6200ded7444aa2c Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 10 Sep 2014 14:07:34 +0300 Subject: cfg80211: allow requesting SMPS mode on ap start Add feature bits to indicate device support for static-smps and dynamic-smps modes. Add a new NL80211_ATTR_SMPS_MODE attribue to allow configuring the smps mode to be used by the ap (e.g. configuring to ap to dynamic smps mode will reduce power consumption while having minor effect on throughput) Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 33 +++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 24 ++++++++++++++++++++++++ 3 files changed, 59 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2ed1f80f7eb6..a2ddcf2398fd 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -664,6 +664,7 @@ struct cfg80211_acl_data { * @crypto: crypto settings * @privacy: the BSS uses privacy * @auth_type: Authentication type (algorithm) + * @smps_mode: SMPS mode * @inactivity_timeout: time in seconds to determine station's inactivity. * @p2p_ctwindow: P2P CT Window * @p2p_opp_ps: P2P opportunistic PS @@ -682,6 +683,7 @@ struct cfg80211_ap_settings { struct cfg80211_crypto_settings crypto; bool privacy; enum nl80211_auth_type auth_type; + enum nl80211_smps_mode smps_mode; int inactivity_timeout; u8 p2p_ctwindow; bool p2p_opp_ps; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e5b8caf5e466..4b28dc07bcb1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1635,6 +1635,9 @@ enum nl80211_commands { * @NL80211_ATTR_ADMITTED_TIME: admitted time in units of 32 microseconds * (per second) (u16 attribute) * + * @NL80211_ATTR_SMPS_MODE: SMPS mode to use (ap mode). see + * &enum nl80211_smps_mode. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1985,6 +1988,8 @@ enum nl80211_attrs { NL80211_ATTR_USER_PRIO, NL80211_ATTR_ADMITTED_TIME, + NL80211_ATTR_SMPS_MODE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4030,6 +4035,13 @@ enum nl80211_ap_sme_features { * @NL80211_FEATURE_ACKTO_ESTIMATION: This driver supports dynamic ACK timeout * estimation (dynack). %NL80211_ATTR_WIPHY_DYN_ACK flag attribute is used * to enable dynack. + * @NL80211_FEATURE_STATIC_SMPS: Device supports static spatial + * multiplexing powersave, ie. can turn off all but one chain + * even on HT connections that should be using more chains. + * @NL80211_FEATURE_DYNAMIC_SMPS: Device supports dynamic spatial + * multiplexing powersave, ie. can turn off all but one chain + * and then wake the rest up as required after, for example, + * rts/cts handshake. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -4056,6 +4068,8 @@ enum nl80211_feature_flags { NL80211_FEATURE_QUIET = 1 << 21, NL80211_FEATURE_TX_POWER_INSERTION = 1 << 22, NL80211_FEATURE_ACKTO_ESTIMATION = 1 << 23, + NL80211_FEATURE_STATIC_SMPS = 1 << 24, + NL80211_FEATURE_DYNAMIC_SMPS = 1 << 25, }; /** @@ -4129,6 +4143,25 @@ enum nl80211_acl_policy { NL80211_ACL_POLICY_DENY_UNLESS_LISTED, }; +/** + * enum nl80211_smps_mode - SMPS mode + * + * Requested SMPS mode (for AP mode) + * + * @NL80211_SMPS_OFF: SMPS off (use all antennas). + * @NL80211_SMPS_STATIC: static SMPS (use a single antenna) + * @NL80211_SMPS_DYNAMIC: dynamic smps (start with a single antenna and + * turn on other antennas after CTS/RTS). + */ +enum nl80211_smps_mode { + NL80211_SMPS_OFF, + NL80211_SMPS_STATIC, + NL80211_SMPS_DYNAMIC, + + __NL80211_SMPS_AFTER_LAST, + NL80211_SMPS_MAX = __NL80211_SMPS_AFTER_LAST - 1 +}; + /** * enum nl80211_radar_event - type of radar event for DFS operation * diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e8114c7a37e3..4cce3e17964d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -394,6 +394,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_TSID] = { .type = NLA_U8 }, [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, + [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, }; /* policy for the key attributes */ @@ -3345,6 +3346,29 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(params.acl); } + if (info->attrs[NL80211_ATTR_SMPS_MODE]) { + params.smps_mode = + nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); + switch (params.smps_mode) { + case NL80211_SMPS_OFF: + break; + case NL80211_SMPS_STATIC: + if (!(rdev->wiphy.features & + NL80211_FEATURE_STATIC_SMPS)) + return -EINVAL; + break; + case NL80211_SMPS_DYNAMIC: + if (!(rdev->wiphy.features & + NL80211_FEATURE_DYNAMIC_SMPS)) + return -EINVAL; + break; + default: + return -EINVAL; + } + } else { + params.smps_mode = NL80211_SMPS_OFF; + } + wdev_lock(wdev); err = rdev_start_ap(rdev, dev, ¶ms); if (!err) { -- cgit v1.2.3 From f69931748730763f8c8095fa88394cf9af75a578 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 10 Sep 2014 14:07:35 +0300 Subject: mac80211: set smps_mode according to ap params Take the requested smps mode from the ap params (instead of always starting with SMPS_OFF) Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 101ae6cfad81..fb6a1502b6df 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -683,8 +683,19 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (old) return -EALREADY; - /* TODO: make hostapd tell us what it wants */ - sdata->smps_mode = IEEE80211_SMPS_OFF; + switch (params->smps_mode) { + case NL80211_SMPS_OFF: + sdata->smps_mode = IEEE80211_SMPS_OFF; + break; + case NL80211_SMPS_STATIC: + sdata->smps_mode = IEEE80211_SMPS_STATIC; + break; + case NL80211_SMPS_DYNAMIC: + sdata->smps_mode = IEEE80211_SMPS_DYNAMIC; + break; + default: + return -EINVAL; + } sdata->needed_rx_chains = sdata->local->rx_chains; mutex_lock(&local->mtx); -- cgit v1.2.3 From 0d8614b4b926d0f657d15d7eb5125bcb24b9fd41 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 10 Sep 2014 14:07:36 +0300 Subject: mac80211: replace SMPS hw flags with wiphy feature bits Use the new static_smps / dynamic_smps feature bits instead of mac80211-internal hw flags. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath10k/mac.c | 5 +++-- drivers/net/wireless/iwlegacy/4965-mac.c | 5 ++--- drivers/net/wireless/iwlwifi/dvm/mac80211.c | 4 ++-- drivers/net/wireless/iwlwifi/mvm/mac80211.c | 8 ++++---- drivers/net/wireless/mac80211_hwsim.c | 8 ++++---- include/net/mac80211.h | 13 +------------ net/mac80211/debugfs.c | 5 ----- net/mac80211/debugfs_netdev.c | 4 ++-- net/mac80211/mlme.c | 2 +- 9 files changed, 19 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 9d61bb157189..4b5f0011ab5d 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -4759,7 +4759,6 @@ int ath10k_mac_register(struct ath10k *ar) IEEE80211_HW_MFP_CAPABLE | IEEE80211_HW_REPORTS_TX_ACK_STATUS | IEEE80211_HW_HAS_RATE_CONTROL | - IEEE80211_HW_SUPPORTS_STATIC_SMPS | IEEE80211_HW_AP_LINK_PS | IEEE80211_HW_SPECTRUM_MGMT; @@ -4767,8 +4766,10 @@ int ath10k_mac_register(struct ath10k *ar) * bytes is used for padding/alignment if necessary. */ ar->hw->extra_tx_headroom += sizeof(struct htt_data_tx_desc_frag)*2 + 4; + ar->hw->wiphy->features |= NL80211_FEATURE_STATIC_SMPS; + if (ar->ht_cap_info & WMI_HT_CAP_DYNAMIC_SMPS) - ar->hw->flags |= IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS; + ar->hw->wiphy->features |= NL80211_FEATURE_DYNAMIC_SMPS; if (ar->ht_cap_info & WMI_HT_CAP_ENABLED) { ar->hw->flags |= IEEE80211_HW_AMPDU_AGGREGATION; diff --git a/drivers/net/wireless/iwlegacy/4965-mac.c b/drivers/net/wireless/iwlegacy/4965-mac.c index 3dcbe2cd2b28..9f930a0cba09 100644 --- a/drivers/net/wireless/iwlegacy/4965-mac.c +++ b/drivers/net/wireless/iwlegacy/4965-mac.c @@ -5757,9 +5757,8 @@ il4965_mac_setup_register(struct il_priv *il, u32 max_probe_length) IEEE80211_HW_REPORTS_TX_ACK_STATUS | IEEE80211_HW_SUPPORTS_PS | IEEE80211_HW_SUPPORTS_DYNAMIC_PS; if (il->cfg->sku & IL_SKU_N) - hw->flags |= - IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS | - IEEE80211_HW_SUPPORTS_STATIC_SMPS; + hw->wiphy->features |= NL80211_FEATURE_DYNAMIC_SMPS | + NL80211_FEATURE_STATIC_SMPS; hw->sta_data_size = sizeof(struct il_station_priv); hw->vif_data_size = sizeof(struct il_vif_priv); diff --git a/drivers/net/wireless/iwlwifi/dvm/mac80211.c b/drivers/net/wireless/iwlwifi/dvm/mac80211.c index afb98f4fdaf3..2364a3c09b9e 100644 --- a/drivers/net/wireless/iwlwifi/dvm/mac80211.c +++ b/drivers/net/wireless/iwlwifi/dvm/mac80211.c @@ -125,8 +125,8 @@ int iwlagn_mac_setup_register(struct iwl_priv *priv, */ if (priv->nvm_data->sku_cap_11n_enable) - hw->flags |= IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS | - IEEE80211_HW_SUPPORTS_STATIC_SMPS; + hw->wiphy->features |= NL80211_FEATURE_DYNAMIC_SMPS | + NL80211_FEATURE_STATIC_SMPS; /* * Enable 11w if advertised by firmware and software crypto diff --git a/drivers/net/wireless/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/iwlwifi/mvm/mac80211.c index 7c8796584c25..395fc05638c2 100644 --- a/drivers/net/wireless/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/iwlwifi/mvm/mac80211.c @@ -301,9 +301,7 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm) IEEE80211_HW_AMPDU_AGGREGATION | IEEE80211_HW_TIMING_BEACON_ONLY | IEEE80211_HW_CONNECTION_MONITOR | - IEEE80211_HW_CHANCTX_STA_CSA | - IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS | - IEEE80211_HW_SUPPORTS_STATIC_SMPS; + IEEE80211_HW_CHANCTX_STA_CSA; hw->queues = mvm->first_agg_queue; hw->offchannel_tx_hw_queue = IWL_MVM_OFFCHANNEL_QUEUE; @@ -405,7 +403,9 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm) hw->wiphy->features |= NL80211_FEATURE_P2P_GO_CTWIN | NL80211_FEATURE_LOW_PRIORITY_SCAN | - NL80211_FEATURE_P2P_GO_OPPPS; + NL80211_FEATURE_P2P_GO_OPPPS | + NL80211_FEATURE_DYNAMIC_SMPS | + NL80211_FEATURE_STATIC_SMPS; mvm->rts_threshold = IEEE80211_MAX_RTS_THRESHOLD; diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 1326f6121835..babbdc1ce741 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2045,8 +2045,6 @@ static int mac80211_hwsim_create_radio(int channels, const char *reg_alpha2, hw->flags = IEEE80211_HW_MFP_CAPABLE | IEEE80211_HW_SIGNAL_DBM | - IEEE80211_HW_SUPPORTS_STATIC_SMPS | - IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS | IEEE80211_HW_AMPDU_AGGREGATION | IEEE80211_HW_WANT_MONITOR_VIF | IEEE80211_HW_QUEUE_CONTROL | @@ -2059,8 +2057,10 @@ static int mac80211_hwsim_create_radio(int channels, const char *reg_alpha2, WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL | WIPHY_FLAG_AP_UAPSD | WIPHY_FLAG_HAS_CHANNEL_SWITCH; - hw->wiphy->features |= NL80211_FEATURE_ACTIVE_MONITOR; - hw->wiphy->features |= NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE; + hw->wiphy->features |= NL80211_FEATURE_ACTIVE_MONITOR | + NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE | + NL80211_FEATURE_STATIC_SMPS | + NL80211_FEATURE_DYNAMIC_SMPS; /* ask mac80211 to reserve space for magic */ hw->vif_data_size = sizeof(struct hwsim_vif_priv); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c6e6a6804ee4..0ad1f47d2dc7 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1537,16 +1537,6 @@ struct ieee80211_tx_control { * @IEEE80211_HW_MFP_CAPABLE: * Hardware supports management frame protection (MFP, IEEE 802.11w). * - * @IEEE80211_HW_SUPPORTS_STATIC_SMPS: - * Hardware supports static spatial multiplexing powersave, - * ie. can turn off all but one chain even on HT connections - * that should be using more chains. - * - * @IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS: - * Hardware supports dynamic spatial multiplexing powersave, - * ie. can turn off all but one chain and then wake the rest - * up as required after, for example, rts/cts handshake. - * * @IEEE80211_HW_SUPPORTS_UAPSD: * Hardware supports Unscheduled Automatic Power Save Delivery * (U-APSD) in managed mode. The mode is configured with @@ -1632,8 +1622,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_DYNAMIC_PS = 1<<12, IEEE80211_HW_MFP_CAPABLE = 1<<13, IEEE80211_HW_WANT_MONITOR_VIF = 1<<14, - IEEE80211_HW_SUPPORTS_STATIC_SMPS = 1<<15, - IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS = 1<<16, + /* free slots */ IEEE80211_HW_SUPPORTS_UAPSD = 1<<17, IEEE80211_HW_REPORTS_TX_ACK_STATUS = 1<<18, IEEE80211_HW_CONNECTION_MONITOR = 1<<19, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 574656174f91..54a189f0393e 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -303,11 +303,6 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_DYNAMIC_PS\n"); if (local->hw.flags & IEEE80211_HW_MFP_CAPABLE) sf += scnprintf(buf + sf, mxln - sf, "MFP_CAPABLE\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_STATIC_SMPS) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_STATIC_SMPS\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) - sf += scnprintf(buf + sf, mxln - sf, - "SUPPORTS_DYNAMIC_SMPS\n"); if (local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_UAPSD\n"); if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index e205ebabfa50..c68896adfa96 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -226,12 +226,12 @@ static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; int err; - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_STATIC_SMPS) && + if (!(local->hw.wiphy->features & NL80211_FEATURE_STATIC_SMPS) && smps_mode == IEEE80211_SMPS_STATIC) return -EINVAL; /* auto should be dynamic if in PS mode */ - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) && + if (!(local->hw.wiphy->features & NL80211_FEATURE_DYNAMIC_SMPS) && (smps_mode == IEEE80211_SMPS_DYNAMIC || smps_mode == IEEE80211_SMPS_AUTOMATIC)) return -EINVAL; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index efa41fc66c1f..9d4ccb2cf3c9 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3805,7 +3805,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ifmgd->uapsd_max_sp_len = sdata->local->hw.uapsd_max_sp_len; ifmgd->p2p_noa_index = -1; - if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) + if (sdata->local->hw.wiphy->features & NL80211_FEATURE_DYNAMIC_SMPS) ifmgd->req_smps = IEEE80211_SMPS_AUTOMATIC; else ifmgd->req_smps = IEEE80211_SMPS_OFF; -- cgit v1.2.3 From 3e8dc212a0e68a9a90c97f34a92c4cdd97d19dd3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Sep 2014 17:42:00 +0200 Subject: netfilter: NFT_CHAIN_NAT_IPV* is independent of NFT_NAT Now that we have masquerading support in nf_tables, the NAT chain can be use with it, not only for SNAT/DNAT. So make this chain type independent of it. While at it, move it inside the scope of 'if NF_NAT_IPV*' to simplify dependencies. Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 19 +++++++++---------- net/ipv6/netfilter/Kconfig | 23 +++++++++++++---------- 2 files changed, 22 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d189c5262bdb..eb6995fc4f67 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -61,16 +61,6 @@ config NFT_CHAIN_ROUTE_IPV4 fields such as the source, destination, type of service and the packet mark. -config NFT_CHAIN_NAT_IPV4 - depends on NF_TABLES_IPV4 - depends on NF_NAT_IPV4 && NFT_NAT - tristate "IPv4 nf_tables nat chain support" - help - This option enables the "nat" chain for IPv4 in nf_tables. This - chain type is used to perform Network Address Translation (NAT) - packet transformations such as the source, destination address and - source and destination ports. - config NFT_REJECT_IPV4 depends on NF_TABLES_IPV4 default NFT_REJECT @@ -94,6 +84,15 @@ config NF_NAT_IPV4 if NF_NAT_IPV4 +config NFT_CHAIN_NAT_IPV4 + depends on NF_TABLES_IPV4 + tristate "IPv4 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv4 in nf_tables. This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. + config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" depends on NF_CONNTRACK_SNMP diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index a8f25306a46a..e854062d0c36 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -40,16 +40,6 @@ config NFT_CHAIN_ROUTE_IPV6 fields such as the source, destination, flowlabel, hop-limit and the packet mark. -config NFT_CHAIN_NAT_IPV6 - depends on NF_TABLES_IPV6 - depends on NF_NAT_IPV6 && NFT_NAT - tristate "IPv6 nf_tables nat chain support" - help - This option enables the "nat" chain for IPv6 in nf_tables. This - chain type is used to perform Network Address Translation (NAT) - packet transformations such as the source, destination address and - source and destination ports. - config NFT_REJECT_IPV6 depends on NF_TABLES_IPV6 default NFT_REJECT @@ -70,6 +60,19 @@ config NF_NAT_IPV6 forms of full Network Address Port Translation. This can be controlled by iptables or nft. +if NF_NAT_IPV6 + +config NFT_CHAIN_NAT_IPV6 + depends on NF_TABLES_IPV6 + tristate "IPv6 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv6 in nf_tables. This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. + +endif # NF_NAT_IPV6 + config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" depends on INET && IPV6 -- cgit v1.2.3 From 0bbe80e571c7b866afd92a98edd32a969467a7a9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Sep 2014 17:51:27 +0200 Subject: netfilter: masquerading needs to be independent of x_tables in Kconfig Users are starting to test nf_tables with no x_tables support. Therefore, masquerading needs to be indenpendent of it from Kconfig. Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 27 +++++++++++++++------------ net/ipv6/netfilter/Kconfig | 27 +++++++++++++++------------ 2 files changed, 30 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index eb6995fc4f67..345242a79db6 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -93,6 +93,21 @@ config NFT_CHAIN_NAT_IPV4 packet transformations such as the source, destination address and source and destination ports. +config NF_NAT_MASQUERADE_IPV4 + tristate "IPv4 masquerade support" + help + This is the kernel functionality to provide NAT in the masquerade + flavour (automatic source address selection). + +config NFT_MASQ_IPV4 + tristate "IPv4 masquerading support for nf_tables" + depends on NF_TABLES_IPV4 + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV4 + help + This is the expression that provides IPv4 masquerading support for + nf_tables. + config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" depends on NF_CONNTRACK_SNMP @@ -231,18 +246,6 @@ config IP_NF_NAT if IP_NF_NAT -config NF_NAT_MASQUERADE_IPV4 - tristate "IPv4 masquerade support" - help - This is the kernel functionality to provide NAT in the masquerade - flavour (automatic source address selection). - -config NFT_MASQ_IPV4 - tristate "IPv4 masquerading support for nf_tables" - depends on NF_TABLES_IPV4 - depends on NFT_MASQ - select NF_NAT_MASQUERADE_IPV4 - config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NF_NAT_MASQUERADE_IPV4 diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index e854062d0c36..bb1a40db7be1 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -71,6 +71,21 @@ config NFT_CHAIN_NAT_IPV6 packet transformations such as the source, destination address and source and destination ports. +config NF_NAT_MASQUERADE_IPV6 + tristate "IPv6 masquerade support" + help + This is the kernel functionality to provide NAT in the masquerade + flavour (automatic source address selection) for IPv6. + +config NFT_MASQ_IPV6 + tristate "IPv6 masquerade support for nf_tables" + depends on NF_TABLES_IPV6 + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV6 + help + This is the expression that provides IPv4 masquerading support for + nf_tables. + endif # NF_NAT_IPV6 config IP6_NF_IPTABLES @@ -261,18 +276,6 @@ config IP6_NF_NAT if IP6_NF_NAT -config NF_NAT_MASQUERADE_IPV6 - tristate "IPv6 masquerade support" - help - This is the kernel functionality to provide NAT in the masquerade - flavour (automatic source address selection) for IPv6. - -config NFT_MASQ_IPV6 - tristate "IPv6 masquerade support for nf_tables" - depends on NF_TABLES_IPV6 - depends on NFT_MASQ - select NF_NAT_MASQUERADE_IPV6 - config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NF_NAT_MASQUERADE_IPV6 -- cgit v1.2.3 From 9a783a139c32a905825ee0aa9597f485ea461f76 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 12 Sep 2014 09:31:52 -0700 Subject: Bluetooth: Fix re-setting RPA as expired when deferring update The hci_update_random_address will clear the RPA_EXPIRED flag and proceed with setting a new one if the flag was set. However, the set_random_addr() function that is called may choose to defer the update to a later moment. In such a case the flag would incorrectly remain unset unless set_random_addr() re-sets it. This patch fixes the issue. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 0d3782ad9a5b..067526d9680d 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3872,6 +3872,7 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) if (test_bit(HCI_LE_ADV, &hdev->dev_flags) || hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) { BT_DBG("Deferring random address update"); + set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); return; } -- cgit v1.2.3 From cab41c47d92851de71c74b1a7bdbf0fadf6ae4ba Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 10 Sep 2014 18:05:26 -0400 Subject: skb: Add documentation for skb_clone_sk This change adds some documentation to the call skb_clone_sk. This is meant to help clarify the purpose of the function for other developers. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a18dfb02d944..c9da77a95e2d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3511,6 +3511,19 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) } EXPORT_SYMBOL(sock_dequeue_err_skb); +/** + * skb_clone_sk - create clone of skb, and take reference to socket + * @skb: the skb to clone + * + * This function creates a clone of a buffer that holds a reference on + * sk_refcnt. Buffers created via this function are meant to be + * returned using sock_queue_err_skb, or free via kfree_skb. + * + * When passing buffers allocated with this function to sock_queue_err_skb + * it is necessary to wrap the call with sock_hold/sock_put in order to + * prevent the socket from being released prior to being enqueued on + * the sk_error_queue. + */ struct sk_buff *skb_clone_sk(struct sk_buff *skb) { struct sock *sk = skb->sk; -- cgit v1.2.3 From bf7fa551e0ce507b82935055f4b4aa229be73eeb Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 10 Sep 2014 18:05:42 -0400 Subject: mac80211: Resolve sk_refcnt/sk_wmem_alloc issue in wifi ack path There is a possible issue with the use, or lack thereof of sk_refcnt and sk_wmem_alloc in the wifi ack status functionality. Specifically if a socket were to request acknowledgements, and the socket were to have sk_refcnt drop to 0 resulting in it waiting on sk_wmem_alloc to reach 0 it would be possible to have sock_queue_err_skb orphan the last buffer, resulting in __sk_free being called on the socket. After this the buffer is enqueued on sk_error_queue, however the queue has already been flushed resulting in at least a memory leak, if not a data corruption. Signed-off-by: Alexander Duyck Acked-by: Johannes Berg Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 +++++ net/mac80211/tx.c | 15 ++++----------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c9da77a95e2d..c8259ac38745 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3628,9 +3628,14 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; + /* take a reference to prevent skb_orphan() from freeing the socket */ + sock_hold(sk); + err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); + + sock_put(sk); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 925c39f4099e..cf7141452b0f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2072,30 +2072,23 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, if (unlikely(!multicast && skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) { - struct sk_buff *orig_skb = skb; + struct sk_buff *ack_skb = skb_clone_sk(skb); - skb = skb_clone(skb, GFP_ATOMIC); - if (skb) { + if (ack_skb) { unsigned long flags; int id; spin_lock_irqsave(&local->ack_status_lock, flags); - id = idr_alloc(&local->ack_status_frames, orig_skb, + id = idr_alloc(&local->ack_status_frames, ack_skb, 1, 0x10000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (id >= 0) { info_id = id; info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; - } else if (skb_shared(skb)) { - kfree_skb(orig_skb); } else { - kfree_skb(skb); - skb = orig_skb; + kfree_skb(ack_skb); } - } else { - /* couldn't clone -- lose tx status ... */ - skb = orig_skb; } } -- cgit v1.2.3 From 2d8f7e2c8a63131828e8d4e2d98835399f27319e Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 10 Sep 2014 21:23:18 -0500 Subject: udp: Fix inverted NAPI_GRO_CB(skb)->flush test Commit 2abb7cdc0d ("udp: Add support for doing checksum unnecessary conversion") caused napi_gro_cb structs with the "flush" field zero to take the "udp_gro_receive" path rather than the "set flush to 1" path that they would previously take. As a result I saw booting from an NFS root hang shortly after starting userspace, with "server not responding" messages. This change to the handling of "flush == 0" packets appears to be incidental to the goal of adding new code in the case where skb_gro_checksum_validate_zero_check() returns zero. Based on that and the fact that it breaks things, I'm assuming that it is unintentional. Fixes: 2abb7cdc0d ("udp: Add support for doing checksum unnecessary conversion") Cc: Tom Herbert Signed-off-by: Scott Wood Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 2 +- net/ipv6/udp_offload.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 52d5f46abf86..adab393b2fe5 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -294,7 +294,7 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head, goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ - if (!NAPI_GRO_CB(skb)->flush) + if (NAPI_GRO_CB(skb)->flush) goto skip; if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index a1ad34b1c4ec..de85f809bf29 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -138,7 +138,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head, goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ - if (!NAPI_GRO_CB(skb)->flush) + if (NAPI_GRO_CB(skb)->flush) goto skip; if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, -- cgit v1.2.3 From 46e5da40aec256155cfedee96dd21a75da941f2c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:04:52 -0700 Subject: net: qdisc: use rcu prefix and silence sparse warnings Add __rcu notation to qdisc handling by doing this we can make smatch output more legible. And anyways some of the cases should be using rcu_dereference() see qdisc_all_tx_empty(), qdisc_tx_chainging(), and so on. Also *wake_queue() API is commonly called from driver timer routines without rcu lock or rtnl lock. So I added rcu_read_lock() blocks around netif_wake_subqueue and netif_tx_wake_queue. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 29 ++++----------------------- include/net/sch_generic.h | 21 +++++++++++++------ net/core/dev.c | 51 +++++++++++++++++++++++++++++++++++++++++++++-- net/sched/sch_generic.c | 4 ++-- net/sched/sch_mqprio.c | 6 ++++-- net/sched/sch_teql.c | 13 +++++++----- 6 files changed, 82 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ba72f6baae1a..ae721f53739e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -543,7 +543,7 @@ struct netdev_queue { * read mostly part */ struct net_device *dev; - struct Qdisc *qdisc; + struct Qdisc __rcu *qdisc; struct Qdisc *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; @@ -2356,12 +2356,7 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd, DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); void __netif_schedule(struct Qdisc *q); - -static inline void netif_schedule_queue(struct netdev_queue *txq) -{ - if (!(txq->state & QUEUE_STATE_ANY_XOFF)) - __netif_schedule(txq->qdisc); -} +void netif_schedule_queue(struct netdev_queue *txq); static inline void netif_tx_schedule_all(struct net_device *dev) { @@ -2397,11 +2392,7 @@ static inline void netif_tx_start_all_queues(struct net_device *dev) } } -static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue) -{ - if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) - __netif_schedule(dev_queue->qdisc); -} +void netif_tx_wake_queue(struct netdev_queue *dev_queue); /** * netif_wake_queue - restart transmit @@ -2673,19 +2664,7 @@ static inline bool netif_subqueue_stopped(const struct net_device *dev, return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb)); } -/** - * netif_wake_subqueue - allow sending packets on subqueue - * @dev: network device - * @queue_index: sub queue index - * - * Resume individual transmit queue of a device with multiple transmit queues. - */ -static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) -{ - struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); - if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) - __netif_schedule(txq->qdisc); -} +void netif_wake_subqueue(struct net_device *dev, u16 queue_index); #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a3cfb8ebeb53..56838ab29b42 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -259,7 +259,9 @@ static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc) static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc) { - return qdisc->dev_queue->qdisc; + struct Qdisc *q = rcu_dereference_rtnl(qdisc->dev_queue->qdisc); + + return q; } static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc) @@ -384,7 +386,7 @@ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) struct Qdisc *qdisc; for (; i < dev->num_tx_queues; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); @@ -402,13 +404,18 @@ static inline void qdisc_reset_all_tx(struct net_device *dev) static inline bool qdisc_all_tx_empty(const struct net_device *dev) { unsigned int i; + + rcu_read_lock(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - const struct Qdisc *q = txq->qdisc; + const struct Qdisc *q = rcu_dereference(txq->qdisc); - if (q->q.qlen) + if (q->q.qlen) { + rcu_read_unlock(); return false; + } } + rcu_read_unlock(); return true; } @@ -416,9 +423,10 @@ static inline bool qdisc_all_tx_empty(const struct net_device *dev) static inline bool qdisc_tx_changing(const struct net_device *dev) { unsigned int i; + for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (txq->qdisc != txq->qdisc_sleeping) + if (rcu_access_pointer(txq->qdisc) != txq->qdisc_sleeping) return true; } return false; @@ -428,9 +436,10 @@ static inline bool qdisc_tx_changing(const struct net_device *dev) static inline bool qdisc_tx_is_noop(const struct net_device *dev) { unsigned int i; + for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (txq->qdisc != &noop_qdisc) + if (rcu_access_pointer(txq->qdisc) != &noop_qdisc) return false; } return true; diff --git a/net/core/dev.c b/net/core/dev.c index 3c6a967e5830..b3d6dbc0c696 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2177,6 +2177,53 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) return (struct dev_kfree_skb_cb *)skb->cb; } +void netif_schedule_queue(struct netdev_queue *txq) +{ + rcu_read_lock(); + if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + struct Qdisc *q = rcu_dereference(txq->qdisc); + + __netif_schedule(q); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(netif_schedule_queue); + +/** + * netif_wake_subqueue - allow sending packets on subqueue + * @dev: network device + * @queue_index: sub queue index + * + * Resume individual transmit queue of a device with multiple transmit queues. + */ +void netif_wake_subqueue(struct net_device *dev, u16 queue_index) +{ + struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); + + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(txq->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_wake_subqueue); + +void netif_tx_wake_queue(struct netdev_queue *dev_queue) +{ + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(dev_queue->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_tx_wake_queue); + void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; @@ -3432,7 +3479,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - q = rxq->qdisc; + q = rcu_dereference(rxq->qdisc); if (q != &noop_qdisc) { spin_lock(qdisc_lock(q)); if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) @@ -3449,7 +3496,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, { struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - if (!rxq || rxq->qdisc == &noop_qdisc) + if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) goto out; if (*pt_prev) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 19696ebe9ebc..346ef85617d3 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -783,7 +783,7 @@ static void dev_deactivate_queue(struct net_device *dev, struct Qdisc *qdisc_default = _qdisc_default; struct Qdisc *qdisc; - qdisc = dev_queue->qdisc; + qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); @@ -876,7 +876,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, { struct Qdisc *qdisc = _qdisc; - dev_queue->qdisc = qdisc; + rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 6749e2f540d0..37e7d25d21f1 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -231,7 +231,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) memset(&sch->qstats, 0, sizeof(sch->qstats)); for (i = 0; i < dev->num_tx_queues; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); spin_lock_bh(qdisc_lock(qdisc)); sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; @@ -340,7 +340,9 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, spin_unlock_bh(d->lock); for (i = tc.offset; i < tc.offset + tc.count; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + struct netdev_queue *q = netdev_get_tx_queue(dev, i); + + qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); bstats.bytes += qdisc->bstats.bytes; bstats.packets += qdisc->bstats.packets; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index aaa8d03ed054..5cd291bd00e4 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -96,11 +96,14 @@ teql_dequeue(struct Qdisc *sch) struct teql_sched_data *dat = qdisc_priv(sch); struct netdev_queue *dat_queue; struct sk_buff *skb; + struct Qdisc *q; skb = __skb_dequeue(&dat->q); dat_queue = netdev_get_tx_queue(dat->m->dev, 0); + q = rcu_dereference_bh(dat_queue->qdisc); + if (skb == NULL) { - struct net_device *m = qdisc_dev(dat_queue->qdisc); + struct net_device *m = qdisc_dev(q); if (m) { dat->m->slaves = sch; netif_wake_queue(m); @@ -108,7 +111,7 @@ teql_dequeue(struct Qdisc *sch) } else { qdisc_bstats_update(sch, skb); } - sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen; + sch->q.qlen = dat->q.qlen + q->q.qlen; return skb; } @@ -157,9 +160,9 @@ teql_destroy(struct Qdisc *sch) txq = netdev_get_tx_queue(master->dev, 0); master->slaves = NULL; - root_lock = qdisc_root_sleeping_lock(txq->qdisc); + root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc)); spin_lock_bh(root_lock); - qdisc_reset(txq->qdisc); + qdisc_reset(rtnl_dereference(txq->qdisc)); spin_unlock_bh(root_lock); } } @@ -266,7 +269,7 @@ static inline int teql_resolve(struct sk_buff *skb, struct dst_entry *dst = skb_dst(skb); int res; - if (txq->qdisc == &noop_qdisc) + if (rcu_access_pointer(txq->qdisc) == &noop_qdisc) return -ENODEV; if (!dev->header_ops || !dst) -- cgit v1.2.3 From 25d8c0d55f241ce2d360df1bea48e23a55836ee6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:05:27 -0700 Subject: net: rcu-ify tcf_proto rcu'ify tcf_proto this allows calling tc_classify() without holding any locks. Updaters are protected by RTNL. This patch prepares the core net_sched infrastracture for running the classifier/action chains without holding the qdisc lock however it does nothing to ensure cls_xxx and act_xxx types also work without locking. Additional patches are required to address the fall out. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sch_generic.h | 9 +++++---- net/sched/cls_api.c | 30 +++++++++++++++--------------- net/sched/sch_api.c | 10 +++++----- net/sched/sch_atm.c | 20 +++++++++++--------- net/sched/sch_cbq.c | 11 +++++++---- net/sched/sch_choke.c | 15 +++++++++------ net/sched/sch_drr.c | 9 ++++++--- net/sched/sch_dsmark.c | 9 +++++---- net/sched/sch_fq_codel.c | 11 +++++++---- net/sched/sch_hfsc.c | 8 ++++---- net/sched/sch_htb.c | 15 ++++++++------- net/sched/sch_ingress.c | 8 +++++--- net/sched/sch_multiq.c | 8 +++++--- net/sched/sch_prio.c | 11 +++++++---- net/sched/sch_qfq.c | 9 ++++++--- net/sched/sch_sfb.c | 15 +++++++++------ net/sched/sch_sfq.c | 11 +++++++---- 17 files changed, 121 insertions(+), 88 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 56838ab29b42..1e89b9ad3a4c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -143,7 +143,7 @@ struct Qdisc_class_ops { void (*walk)(struct Qdisc *, struct qdisc_walker * arg); /* Filter manipulation */ - struct tcf_proto ** (*tcf_chain)(struct Qdisc *, unsigned long); + struct tcf_proto __rcu ** (*tcf_chain)(struct Qdisc *, unsigned long); unsigned long (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid); void (*unbind_tcf)(struct Qdisc *, unsigned long); @@ -212,8 +212,8 @@ struct tcf_proto_ops { struct tcf_proto { /* Fast access part */ - struct tcf_proto *next; - void *root; + struct tcf_proto __rcu *next; + void __rcu *root; int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); @@ -225,6 +225,7 @@ struct tcf_proto { struct Qdisc *q; void *data; const struct tcf_proto_ops *ops; + struct rcu_head rcu; }; struct qdisc_skb_cb { @@ -378,7 +379,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab); void tcf_destroy(struct tcf_proto *tp); -void tcf_destroy_chain(struct tcf_proto **fl); +void tcf_destroy_chain(struct tcf_proto __rcu **fl); /* Reset all TX qdiscs greater then index of a device. */ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c28b0d327b12..e547efdaba9d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -117,7 +117,6 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; - spinlock_t *root_lock; struct tcmsg *t; u32 protocol; u32 prio; @@ -125,7 +124,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) u32 parent; struct net_device *dev; struct Qdisc *q; - struct tcf_proto **back, **chain; + struct tcf_proto __rcu **back; + struct tcf_proto __rcu **chain; struct tcf_proto *tp; const struct tcf_proto_ops *tp_ops; const struct Qdisc_class_ops *cops; @@ -197,7 +197,9 @@ replay: goto errout; /* Check the chain for existence of proto-tcf with this priority */ - for (back = chain; (tp = *back) != NULL; back = &tp->next) { + for (back = chain; + (tp = rtnl_dereference(*back)) != NULL; + back = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { if (!nprio || @@ -209,8 +211,6 @@ replay: } } - root_lock = qdisc_root_sleeping_lock(q); - if (tp == NULL) { /* Proto-tcf does not exist, create new one */ @@ -259,7 +259,8 @@ replay: } tp->ops = tp_ops; tp->protocol = protocol; - tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(*back)); + tp->prio = nprio ? : + TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back))); tp->q = q; tp->classify = tp_ops->classify; tp->classid = parent; @@ -280,9 +281,9 @@ replay: if (fh == 0) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { - spin_lock_bh(root_lock); - *back = tp->next; - spin_unlock_bh(root_lock); + struct tcf_proto *next = rtnl_dereference(tp->next); + + RCU_INIT_POINTER(*back, next); tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); tcf_destroy(tp); @@ -322,10 +323,8 @@ replay: n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE); if (err == 0) { if (tp_created) { - spin_lock_bh(root_lock); - tp->next = *back; - *back = tp; - spin_unlock_bh(root_lock); + RCU_INIT_POINTER(tp->next, rtnl_dereference(*back)); + rcu_assign_pointer(*back, tp); } tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); } else { @@ -420,7 +419,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) int s_t; struct net_device *dev; struct Qdisc *q; - struct tcf_proto *tp, **chain; + struct tcf_proto *tp, __rcu **chain; struct tcmsg *tcm = nlmsg_data(cb->nlh); unsigned long cl = 0; const struct Qdisc_class_ops *cops; @@ -454,7 +453,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; - for (tp = *chain, t = 0; tp; tp = tp->next, t++) { + for (tp = rtnl_dereference(*chain), t = 0; + tp; tp = rtnl_dereference(tp->next), t++) { if (t < s_t) continue; if (TC_H_MAJ(tcm->tcm_info) && diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 58bed7599db7..ca6248345937 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1781,7 +1781,7 @@ int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, __be16 protocol = skb->protocol; int err; - for (; tp; tp = tp->next) { + for (; tp; tp = rcu_dereference_bh(tp->next)) { if (tp->protocol != protocol && tp->protocol != htons(ETH_P_ALL)) continue; @@ -1833,15 +1833,15 @@ void tcf_destroy(struct tcf_proto *tp) { tp->ops->destroy(tp); module_put(tp->ops->owner); - kfree(tp); + kfree_rcu(tp, rcu); } -void tcf_destroy_chain(struct tcf_proto **fl) +void tcf_destroy_chain(struct tcf_proto __rcu **fl) { struct tcf_proto *tp; - while ((tp = *fl) != NULL) { - *fl = tp->next; + while ((tp = rtnl_dereference(*fl)) != NULL) { + RCU_INIT_POINTER(*fl, tp->next); tcf_destroy(tp); } } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 8449b337f9e3..c398f9c3dbdd 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -41,7 +41,7 @@ struct atm_flow_data { struct Qdisc *q; /* FIFO, TBF, etc. */ - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb); /* chaining */ @@ -273,7 +273,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, error = -ENOBUFS; goto err_out; } - flow->filter_list = NULL; + RCU_INIT_POINTER(flow->filter_list, NULL); flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); if (!flow->q) flow->q = &noop_qdisc; @@ -311,7 +311,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); if (list_empty(&flow->list)) return -EINVAL; - if (flow->filter_list || flow == &p->link) + if (rcu_access_pointer(flow->filter_list) || flow == &p->link) return -EBUSY; /* * Reference count must be 2: one for "keepalive" (set at class @@ -345,7 +345,8 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)cl; @@ -369,11 +370,12 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) flow = NULL; if (TC_H_MAJ(skb->priority) != sch->handle || !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) { + struct tcf_proto *fl; + list_for_each_entry(flow, &p->flows, list) { - if (flow->filter_list) { - result = tc_classify_compat(skb, - flow->filter_list, - &res); + fl = rcu_dereference_bh(flow->filter_list); + if (fl) { + result = tc_classify_compat(skb, fl, &res); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; @@ -544,7 +546,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) if (!p->link.q) p->link.q = &noop_qdisc; pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - p->link.filter_list = NULL; + RCU_INIT_POINTER(p->link.filter_list, NULL); p->link.vcc = NULL; p->link.sock = NULL; p->link.classid = sch->handle; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 762a04bb8f6d..a3244a800501 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -133,7 +133,7 @@ struct cbq_class { struct gnet_stats_rate_est64 rate_est; struct tc_cbq_xstats xstats; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; int refcnt; int filters; @@ -221,6 +221,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) struct cbq_class **defmap; struct cbq_class *cl = NULL; u32 prio = skb->priority; + struct tcf_proto *fl; struct tcf_result res; /* @@ -235,11 +236,12 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int result = 0; defmap = head->defaults; + fl = rcu_dereference_bh(head->filter_list); /* * Step 2+n. Apply classifier. */ - if (!head->filter_list || - (result = tc_classify_compat(skb, head->filter_list, &res)) < 0) + result = tc_classify_compat(skb, fl, &res); + if (!fl || result < 0) goto fallback; cl = (void *)res.class; @@ -1954,7 +1956,8 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) return 0; } -static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) +static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch, + unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ed30e436128b..74813e6b6ff6 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -57,7 +57,7 @@ struct choke_sched_data { /* Variables */ struct red_vars vars; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct { u32 prob_drop; /* Early probability drops */ u32 prob_mark; /* Early probability marks */ @@ -193,9 +193,11 @@ static bool choke_classify(struct sk_buff *skb, { struct choke_sched_data *q = qdisc_priv(sch); struct tcf_result res; + struct tcf_proto *fl; int result; - result = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -249,7 +251,7 @@ static bool choke_match_random(const struct choke_sched_data *q, return false; oskb = choke_peek_random(q, pidx); - if (q->filter_list) + if (rcu_access_pointer(q->filter_list)) return choke_get_classid(nskb) == choke_get_classid(oskb); return choke_match_flow(oskb, nskb); @@ -257,11 +259,11 @@ static bool choke_match_random(const struct choke_sched_data *q, static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) { + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; struct choke_sched_data *q = qdisc_priv(sch); const struct red_parms *p = &q->parms; - int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - if (q->filter_list) { + if (rcu_access_pointer(q->filter_list)) { /* If using external classifiers, get result and record it. */ if (!choke_classify(skb, sch, &ret)) goto other_drop; /* Packet was eaten by filter */ @@ -554,7 +556,8 @@ static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent, return 0; } -static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct choke_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 7bbbfe112192..d8b5ccfd248a 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -35,7 +35,7 @@ struct drr_class { struct drr_sched { struct list_head active; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct Qdisc_class_hash clhash; }; @@ -184,7 +184,8 @@ static void drr_put_class(struct Qdisc *sch, unsigned long arg) drr_destroy_class(sch, cl); } -static struct tcf_proto **drr_tcf_chain(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **drr_tcf_chain(struct Qdisc *sch, + unsigned long cl) { struct drr_sched *q = qdisc_priv(sch); @@ -319,6 +320,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct tcf_result res; + struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { @@ -328,7 +330,8 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 49d6ef338b55..485e456c8139 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -37,7 +37,7 @@ struct dsmark_qdisc_data { struct Qdisc *q; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; u8 *mask; /* "owns" the array */ u8 *value; u16 indices; @@ -186,8 +186,8 @@ ignore: } } -static inline struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch, - unsigned long cl) +static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct dsmark_qdisc_data *p = qdisc_priv(sch); return &p->filter_list; @@ -229,7 +229,8 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) skb->tc_index = TC_H_MIN(skb->priority); else { struct tcf_result res; - int result = tc_classify(skb, p->filter_list, &res); + struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); + int result = tc_classify(skb, fl, &res); pr_debug("result %d class 0x%04x\n", result, res.classid); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index cc56c8bb9bed..105cf5557630 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -52,7 +52,7 @@ struct fq_codel_flow { }; /* please try to keep this structure <= 64 bytes */ struct fq_codel_sched_data { - struct tcf_proto *filter_list; /* optional external classifier */ + struct tcf_proto __rcu *filter_list; /* optional external classifier */ struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ u32 *backlogs; /* backlog table [flows_cnt] */ u32 flows_cnt; /* number of flows */ @@ -85,6 +85,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tcf_proto *filter; struct tcf_result res; int result; @@ -93,11 +94,12 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, TC_H_MIN(skb->priority) <= q->flows_cnt) return TC_H_MIN(skb->priority); - if (!q->filter_list) + filter = rcu_dereference(q->filter_list); + if (!filter) return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + result = tc_classify(skb, filter, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -496,7 +498,8 @@ static void fq_codel_put(struct Qdisc *q, unsigned long cl) { } -static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **fq_codel_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct fq_codel_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index ec8aeaac1dd7..04b0de4c68b5 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -116,7 +116,7 @@ struct hfsc_class { struct gnet_stats_queue qstats; struct gnet_stats_rate_est64 rate_est; unsigned int level; /* class level in hierarchy */ - struct tcf_proto *filter_list; /* filter list */ + struct tcf_proto __rcu *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ struct hfsc_sched *sched; /* scheduler data */ @@ -1161,7 +1161,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; - tcf = q->root.filter_list; + tcf = rcu_dereference_bh(q->root.filter_list); while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -1185,7 +1185,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) return cl; /* hit leaf class */ /* apply inner filter chain */ - tcf = cl->filter_list; + tcf = rcu_dereference_bh(cl->filter_list); head = cl; } @@ -1285,7 +1285,7 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) cl->filter_cnt--; } -static struct tcf_proto ** +static struct tcf_proto __rcu ** hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) { struct hfsc_sched *q = qdisc_priv(sch); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index aea942ce6008..6d16b9b81c28 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -103,7 +103,7 @@ struct htb_class { u32 prio; /* these two are used only by leaves... */ int quantum; /* but stored for parent-to-leaf return */ - struct tcf_proto *filter_list; /* class attached filters */ + struct tcf_proto __rcu *filter_list; /* class attached filters */ int filter_cnt; int refcnt; /* usage count of this class */ @@ -153,7 +153,7 @@ struct htb_sched { int rate2quantum; /* quant = rate / rate2quantum */ /* filters for qdisc itself */ - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; #define HTB_WARN_TOOMANYEVENTS 0x1 unsigned int warned; /* only one warning */ @@ -223,9 +223,9 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, if (cl->level == 0) return cl; /* Start with inner filter chain if a non-leaf class is selected */ - tcf = cl->filter_list; + tcf = rcu_dereference_bh(cl->filter_list); } else { - tcf = q->filter_list; + tcf = rcu_dereference_bh(q->filter_list); } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; @@ -251,7 +251,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, return cl; /* we hit leaf; return it */ /* we have got inner class; apply inner filter chain */ - tcf = cl->filter_list; + tcf = rcu_dereference_bh(cl->filter_list); } /* classification failed; try to use default class */ cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); @@ -1519,11 +1519,12 @@ failure: return err; } -static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg) +static struct tcf_proto __rcu **htb_find_tcf(struct Qdisc *sch, + unsigned long arg) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; - struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list; + struct tcf_proto __rcu **fl = cl ? &cl->filter_list : &q->filter_list; return fl; } diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 62871c14e1f9..b351125f3849 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -17,7 +17,7 @@ struct ingress_qdisc_data { - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; }; /* ------------------------- Class/flow operations ------------------------- */ @@ -46,7 +46,8 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) { } -static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct ingress_qdisc_data *p = qdisc_priv(sch); @@ -59,9 +60,10 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct ingress_qdisc_data *p = qdisc_priv(sch); struct tcf_result res; + struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); int result; - result = tc_classify(skb, p->filter_list, &res); + result = tc_classify(skb, fl, &res); qdisc_bstats_update(sch, skb); switch (result) { diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index afb050a735fa..c0466c1840f3 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -31,7 +31,7 @@ struct multiq_sched_data { u16 bands; u16 max_bands; u16 curband; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct Qdisc **queues; }; @@ -42,10 +42,11 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) struct multiq_sched_data *q = qdisc_priv(sch); u32 band; struct tcf_result res; + struct tcf_proto *fl = rcu_dereference_bh(q->filter_list); int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tc_classify(skb, q->filter_list, &res); + err = tc_classify(skb, fl, &res); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -388,7 +389,8 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **multiq_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct multiq_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 79359b69ad8d..03ef99e52a5c 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -24,7 +24,7 @@ struct prio_sched_data { int bands; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; u8 prio2band[TC_PRIO_MAX+1]; struct Qdisc *queues[TCQ_PRIO_BANDS]; }; @@ -36,11 +36,13 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) struct prio_sched_data *q = qdisc_priv(sch); u32 band = skb->priority; struct tcf_result res; + struct tcf_proto *fl; int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { - err = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + err = tc_classify(skb, fl, &res); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -50,7 +52,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) return NULL; } #endif - if (!q->filter_list || err < 0) { + if (!fl || err < 0) { if (TC_H_MAJ(band)) band = 0; return q->queues[q->prio2band[band & TC_PRIO_MAX]]; @@ -351,7 +353,8 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **prio_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct prio_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 8056fb4e618a..602ea01a4ddd 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -181,7 +181,7 @@ struct qfq_group { }; struct qfq_sched { - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct Qdisc_class_hash clhash; u64 oldV, V; /* Precise virtual times. */ @@ -576,7 +576,8 @@ static void qfq_put_class(struct Qdisc *sch, unsigned long arg) qfq_destroy_class(sch, cl); } -static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **qfq_tcf_chain(struct Qdisc *sch, + unsigned long cl) { struct qfq_sched *q = qdisc_priv(sch); @@ -704,6 +705,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; struct tcf_result res; + struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { @@ -714,7 +716,8 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 9b0f7093d970..1562fb2b3f46 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -55,7 +55,7 @@ struct sfb_bins { struct sfb_sched_data { struct Qdisc *qdisc; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; unsigned long rehash_interval; unsigned long warmup_time; /* double buffering warmup time in jiffies */ u32 max; @@ -253,13 +253,13 @@ static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) return false; } -static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q, +static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, int *qerr, u32 *salt) { struct tcf_result res; int result; - result = tc_classify(skb, q->filter_list, &res); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -281,6 +281,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; + struct tcf_proto *fl; int i; u32 p_min = ~0; u32 minqlen = ~0; @@ -306,9 +307,10 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } } - if (q->filter_list) { + fl = rcu_dereference_bh(q->filter_list); + if (fl) { /* If using external classifiers, get result and record it. */ - if (!sfb_classify(skb, q, &ret, &salt)) + if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; keys.src = salt; keys.dst = 0; @@ -660,7 +662,8 @@ static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **sfb_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct sfb_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 211db9017c35..80c36bd54abc 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -125,7 +125,7 @@ struct sfq_sched_data { u8 cur_depth; /* depth of longest slot */ u8 flags; unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; sfq_index *ht; /* Hash table ('divisor' slots) */ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ @@ -187,6 +187,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, { struct sfq_sched_data *q = qdisc_priv(sch); struct tcf_result res; + struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority) == sch->handle && @@ -194,13 +195,14 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, TC_H_MIN(skb->priority) <= q->divisor) return TC_H_MIN(skb->priority); - if (!q->filter_list) { + fl = rcu_dereference_bh(q->filter_list); + if (!fl) { skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys); return sfq_hash(q, skb) + 1; } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -836,7 +838,8 @@ static void sfq_put(struct Qdisc *q, unsigned long cl) { } -static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct sfq_sched_data *q = qdisc_priv(sch); -- cgit v1.2.3 From 9888faefe1327909f3acf34d1feda87a368bb858 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:05:59 -0700 Subject: net: sched: cls_basic use RCU Enable basic classifier for RCU. Dereferencing tp->root may look a bit strange here but it is needed by my accounting because it is allocated at init time and needs to be kfree'd at destroy time. However because it may be referenced in the classify() path we must wait an RCU grace period before free'ing it. We use kfree_rcu() and rcu_ APIs to enforce this. This pattern is used in all the classifiers. Also the hgenerator can be incremented without concern because it is always incremented under RTNL. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_basic.c | 80 +++++++++++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 0ae1813e3e90..1937298d6775 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -24,6 +24,7 @@ struct basic_head { u32 hgenerator; struct list_head flist; + struct rcu_head rcu; }; struct basic_filter { @@ -31,17 +32,19 @@ struct basic_filter { struct tcf_exts exts; struct tcf_ematch_tree ematches; struct tcf_result res; + struct tcf_proto *tp; struct list_head link; + struct rcu_head rcu; }; static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { int r; - struct basic_head *head = tp->root; + struct basic_head *head = rcu_dereference_bh(tp->root); struct basic_filter *f; - list_for_each_entry(f, &head->flist, link) { + list_for_each_entry_rcu(f, &head->flist, link) { if (!tcf_em_tree_match(skb, &f->ematches, NULL)) continue; *res = f->res; @@ -56,7 +59,7 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long basic_get(struct tcf_proto *tp, u32 handle) { unsigned long l = 0UL; - struct basic_head *head = tp->root; + struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f; if (head == NULL) @@ -81,12 +84,15 @@ static int basic_init(struct tcf_proto *tp) if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->flist); - tp->root = head; + rcu_assign_pointer(tp->root, head); return 0; } -static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f) +static void basic_delete_filter(struct rcu_head *head) { + struct basic_filter *f = container_of(head, struct basic_filter, rcu); + struct tcf_proto *tp = f->tp; + tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(tp, &f->exts); tcf_em_tree_destroy(tp, &f->ematches); @@ -95,27 +101,26 @@ static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f) static void basic_destroy(struct tcf_proto *tp) { - struct basic_head *head = tp->root; + struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f, *n; list_for_each_entry_safe(f, n, &head->flist, link) { - list_del(&f->link); - basic_delete_filter(tp, f); + list_del_rcu(&f->link); + call_rcu(&f->rcu, basic_delete_filter); } - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); } static int basic_delete(struct tcf_proto *tp, unsigned long arg) { - struct basic_head *head = tp->root; + struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *t, *f = (struct basic_filter *) arg; list_for_each_entry(t, &head->flist, link) if (t == f) { - tcf_tree_lock(tp); - list_del(&t->link); - tcf_tree_unlock(tp); - basic_delete_filter(tp, t); + list_del_rcu(&t->link); + call_rcu(&t->rcu, basic_delete_filter); return 0; } @@ -152,6 +157,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, tcf_exts_change(tp, &f->exts, &e); tcf_em_tree_change(tp, &f->ematches, &t); + f->tp = tp; return 0; errout: @@ -164,9 +170,10 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, struct nlattr **tca, unsigned long *arg, bool ovr) { int err; - struct basic_head *head = tp->root; + struct basic_head *head = rtnl_dereference(tp->root); struct nlattr *tb[TCA_BASIC_MAX + 1]; - struct basic_filter *f = (struct basic_filter *) *arg; + struct basic_filter *fold = (struct basic_filter *) *arg; + struct basic_filter *fnew; if (tca[TCA_OPTIONS] == NULL) return -EINVAL; @@ -176,22 +183,23 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - if (f != NULL) { - if (handle && f->handle != handle) + if (fold != NULL) { + if (handle && fold->handle != handle) return -EINVAL; - return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); } err = -ENOBUFS; - f = kzalloc(sizeof(*f), GFP_KERNEL); - if (f == NULL) + fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + if (fnew == NULL) goto errout; - tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); + tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); err = -EINVAL; - if (handle) - f->handle = handle; - else { + if (handle) { + fnew->handle = handle; + } else if (fold) { + fnew->handle = fold->handle; + } else { unsigned int i = 0x80000000; do { if (++head->hgenerator == 0x7FFFFFFF) @@ -203,29 +211,31 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, goto errout; } - f->handle = head->hgenerator; + fnew->handle = head->hgenerator; } - err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); + err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr); if (err < 0) goto errout; - tcf_tree_lock(tp); - list_add(&f->link, &head->flist); - tcf_tree_unlock(tp); - *arg = (unsigned long) f; + *arg = (unsigned long)fnew; + + if (fold) { + list_replace_rcu(&fold->link, &fnew->link); + call_rcu(&fold->rcu, basic_delete_filter); + } else { + list_add_rcu(&fnew->link, &head->flist); + } return 0; errout: - if (*arg == 0UL && f) - kfree(f); - + kfree(fnew); return err; } static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct basic_head *head = tp->root; + struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { -- cgit v1.2.3 From 952313bd62589cae216a579bb7ebc76f8e290817 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:06:26 -0700 Subject: net: sched: cls_cgroup use RCU Make cgroup classifier safe for RCU. Also drops the calls in the classify routine that were doing a rcu_read_lock()/rcu_read_unlock(). If the rcu_read_lock() isn't held entering this routine we have issues with deleting the classifier chain so remove the unnecessary rcu_read_lock()/rcu_read_unlock() pair noting all paths AFAIK hold rcu_read_lock. If there is a case where classify is called without the rcu read lock then an rcu splat will occur and we can correct it. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_cgroup.c | 63 +++++++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index cacf01bd04f0..3b7548759998 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -22,17 +22,17 @@ struct cls_cgroup_head { u32 handle; struct tcf_exts exts; struct tcf_ematch_tree ematches; + struct tcf_proto *tp; + struct rcu_head rcu; }; static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct cls_cgroup_head *head = tp->root; + struct cls_cgroup_head *head = rcu_dereference_bh(tp->root); u32 classid; - rcu_read_lock(); classid = task_cls_state(current)->classid; - rcu_read_unlock(); /* * Due to the nature of the classifier it is required to ignore all @@ -80,13 +80,25 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, }; +static void cls_cgroup_destroy_rcu(struct rcu_head *root) +{ + struct cls_cgroup_head *head = container_of(root, + struct cls_cgroup_head, + rcu); + + tcf_exts_destroy(head->tp, &head->exts); + tcf_em_tree_destroy(head->tp, &head->ematches); + kfree(head); +} + static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg, bool ovr) { struct nlattr *tb[TCA_CGROUP_MAX + 1]; - struct cls_cgroup_head *head = tp->root; + struct cls_cgroup_head *head = rtnl_dereference(tp->root); + struct cls_cgroup_head *new; struct tcf_ematch_tree t; struct tcf_exts e; int err; @@ -94,25 +106,24 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (!tca[TCA_OPTIONS]) return -EINVAL; - if (head == NULL) { - if (!handle) - return -EINVAL; + if (!head && !handle) + return -EINVAL; - head = kzalloc(sizeof(*head), GFP_KERNEL); - if (head == NULL) - return -ENOBUFS; + if (head && handle != head->handle) + return -ENOENT; - tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); - head->handle = handle; + new = kzalloc(sizeof(*head), GFP_KERNEL); + if (!new) + return -ENOBUFS; - tcf_tree_lock(tp); - tp->root = head; - tcf_tree_unlock(tp); + if (head) { + new->handle = head->handle; + } else { + tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + new->handle = handle; } - if (handle != head->handle) - return -ENOENT; - + new->tp = tp; err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], cgroup_policy); if (err < 0) @@ -127,20 +138,24 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - tcf_exts_change(tp, &head->exts, &e); - tcf_em_tree_change(tp, &head->ematches, &t); + tcf_exts_change(tp, &new->exts, &e); + tcf_em_tree_change(tp, &new->ematches, &t); + rcu_assign_pointer(tp->root, new); + if (head) + call_rcu(&head->rcu, cls_cgroup_destroy_rcu); return 0; } static void cls_cgroup_destroy(struct tcf_proto *tp) { - struct cls_cgroup_head *head = tp->root; + struct cls_cgroup_head *head = rtnl_dereference(tp->root); if (head) { tcf_exts_destroy(tp, &head->exts); tcf_em_tree_destroy(tp, &head->ematches); - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); } } @@ -151,7 +166,7 @@ static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg) static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct cls_cgroup_head *head = tp->root; + struct cls_cgroup_head *head = rtnl_dereference(tp->root); if (arg->count < arg->skip) goto skip; @@ -167,7 +182,7 @@ skip: static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct cls_cgroup_head *head = tp->root; + struct cls_cgroup_head *head = rtnl_dereference(tp->root); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; -- cgit v1.2.3 From 70da9f0bf999627e50950f6845bd3819ff811085 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:06:55 -0700 Subject: net: sched: cls_flow use RCU Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_flow.c | 145 +++++++++++++++++++++++++++++---------------------- 1 file changed, 84 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 35be16f7c192..95736fa479f3 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -34,12 +34,14 @@ struct flow_head { struct list_head filters; + struct rcu_head rcu; }; struct flow_filter { struct list_head list; struct tcf_exts exts; struct tcf_ematch_tree ematches; + struct tcf_proto *tp; struct timer_list perturb_timer; u32 perturb_period; u32 handle; @@ -54,6 +56,7 @@ struct flow_filter { u32 divisor; u32 baseclass; u32 hashrnd; + struct rcu_head rcu; }; static inline u32 addr_fold(void *addr) @@ -276,14 +279,14 @@ static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct flow_head *head = tp->root; + struct flow_head *head = rcu_dereference_bh(tp->root); struct flow_filter *f; u32 keymask; u32 classid; unsigned int n, key; int r; - list_for_each_entry(f, &head->filters, list) { + list_for_each_entry_rcu(f, &head->filters, list) { u32 keys[FLOW_KEY_MAX + 1]; struct flow_keys flow_keys; @@ -346,13 +349,23 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, }; +static void flow_destroy_filter(struct rcu_head *head) +{ + struct flow_filter *f = container_of(head, struct flow_filter, rcu); + + del_timer_sync(&f->perturb_timer); + tcf_exts_destroy(f->tp, &f->exts); + tcf_em_tree_destroy(f->tp, &f->ematches); + kfree(f); +} + static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg, bool ovr) { - struct flow_head *head = tp->root; - struct flow_filter *f; + struct flow_head *head = rtnl_dereference(tp->root); + struct flow_filter *fold, *fnew; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FLOW_MAX + 1]; struct tcf_exts e; @@ -401,20 +414,42 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto err1; - f = (struct flow_filter *)*arg; - if (f != NULL) { + err = -ENOBUFS; + fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + if (!fnew) + goto err2; + + fold = (struct flow_filter *)*arg; + if (fold) { err = -EINVAL; - if (f->handle != handle && handle) + if (fold->handle != handle && handle) goto err2; - mode = f->mode; + /* Copy fold into fnew */ + fnew->handle = fold->handle; + fnew->keymask = fold->keymask; + fnew->tp = fold->tp; + + fnew->handle = fold->handle; + fnew->nkeys = fold->nkeys; + fnew->keymask = fold->keymask; + fnew->mode = fold->mode; + fnew->mask = fold->mask; + fnew->xor = fold->xor; + fnew->rshift = fold->rshift; + fnew->addend = fold->addend; + fnew->divisor = fold->divisor; + fnew->baseclass = fold->baseclass; + fnew->hashrnd = fold->hashrnd; + + mode = fold->mode; if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) goto err2; if (mode == FLOW_MODE_HASH) - perturb_period = f->perturb_period; + perturb_period = fold->perturb_period; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) goto err2; @@ -444,83 +479,70 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (TC_H_MIN(baseclass) == 0) baseclass = TC_H_MAKE(baseclass, 1); - err = -ENOBUFS; - f = kzalloc(sizeof(*f), GFP_KERNEL); - if (f == NULL) - goto err2; - - f->handle = handle; - f->mask = ~0U; - tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); - - get_random_bytes(&f->hashrnd, 4); - f->perturb_timer.function = flow_perturbation; - f->perturb_timer.data = (unsigned long)f; - init_timer_deferrable(&f->perturb_timer); + fnew->handle = handle; + fnew->mask = ~0U; + fnew->tp = tp; + get_random_bytes(&fnew->hashrnd, 4); + tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); } - tcf_exts_change(tp, &f->exts, &e); - tcf_em_tree_change(tp, &f->ematches, &t); + fnew->perturb_timer.function = flow_perturbation; + fnew->perturb_timer.data = (unsigned long)fnew; + init_timer_deferrable(&fnew->perturb_timer); - tcf_tree_lock(tp); + tcf_exts_change(tp, &fnew->exts, &e); + tcf_em_tree_change(tp, &fnew->ematches, &t); if (tb[TCA_FLOW_KEYS]) { - f->keymask = keymask; - f->nkeys = nkeys; + fnew->keymask = keymask; + fnew->nkeys = nkeys; } - f->mode = mode; + fnew->mode = mode; if (tb[TCA_FLOW_MASK]) - f->mask = nla_get_u32(tb[TCA_FLOW_MASK]); + fnew->mask = nla_get_u32(tb[TCA_FLOW_MASK]); if (tb[TCA_FLOW_XOR]) - f->xor = nla_get_u32(tb[TCA_FLOW_XOR]); + fnew->xor = nla_get_u32(tb[TCA_FLOW_XOR]); if (tb[TCA_FLOW_RSHIFT]) - f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]); + fnew->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]); if (tb[TCA_FLOW_ADDEND]) - f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]); + fnew->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]); if (tb[TCA_FLOW_DIVISOR]) - f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]); + fnew->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]); if (baseclass) - f->baseclass = baseclass; + fnew->baseclass = baseclass; - f->perturb_period = perturb_period; - del_timer(&f->perturb_timer); + fnew->perturb_period = perturb_period; if (perturb_period) - mod_timer(&f->perturb_timer, jiffies + perturb_period); + mod_timer(&fnew->perturb_timer, jiffies + perturb_period); if (*arg == 0) - list_add_tail(&f->list, &head->filters); + list_add_tail_rcu(&fnew->list, &head->filters); + else + list_replace_rcu(&fnew->list, &fold->list); - tcf_tree_unlock(tp); + *arg = (unsigned long)fnew; - *arg = (unsigned long)f; + if (fold) + call_rcu(&fold->rcu, flow_destroy_filter); return 0; err2: tcf_em_tree_destroy(tp, &t); + kfree(fnew); err1: tcf_exts_destroy(tp, &e); return err; } -static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f) -{ - del_timer_sync(&f->perturb_timer); - tcf_exts_destroy(tp, &f->exts); - tcf_em_tree_destroy(tp, &f->ematches); - kfree(f); -} - static int flow_delete(struct tcf_proto *tp, unsigned long arg) { struct flow_filter *f = (struct flow_filter *)arg; - tcf_tree_lock(tp); - list_del(&f->list); - tcf_tree_unlock(tp); - flow_destroy_filter(tp, f); + list_del_rcu(&f->list); + call_rcu(&f->rcu, flow_destroy_filter); return 0; } @@ -532,28 +554,29 @@ static int flow_init(struct tcf_proto *tp) if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->filters); - tp->root = head; + rcu_assign_pointer(tp->root, head); return 0; } static void flow_destroy(struct tcf_proto *tp) { - struct flow_head *head = tp->root; + struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f, *next; list_for_each_entry_safe(f, next, &head->filters, list) { - list_del(&f->list); - flow_destroy_filter(tp, f); + list_del_rcu(&f->list); + call_rcu(&f->rcu, flow_destroy_filter); } - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); } static unsigned long flow_get(struct tcf_proto *tp, u32 handle) { - struct flow_head *head = tp->root; + struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; - list_for_each_entry(f, &head->filters, list) + list_for_each_entry_rcu(f, &head->filters, list) if (f->handle == handle) return (unsigned long)f; return 0; @@ -626,10 +649,10 @@ nla_put_failure: static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct flow_head *head = tp->root; + struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; - list_for_each_entry(f, &head->filters, list) { + list_for_each_entry_rcu(f, &head->filters, list) { if (arg->count < arg->skip) goto skip; if (arg->fn(tp, (unsigned long)f, arg) < 0) { -- cgit v1.2.3 From e35a8ee5993ba81fd6c092f6827458c60406255b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:07:22 -0700 Subject: net: sched: fw use RCU RCU'ify fw classifier. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_fw.c | 111 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 77 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 861b03ccfed0..006b45a67fdd 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -33,17 +33,20 @@ struct fw_head { u32 mask; - struct fw_filter *ht[HTSIZE]; + struct fw_filter __rcu *ht[HTSIZE]; + struct rcu_head rcu; }; struct fw_filter { - struct fw_filter *next; + struct fw_filter __rcu *next; u32 id; struct tcf_result res; #ifdef CONFIG_NET_CLS_IND int ifindex; #endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; + struct tcf_proto *tp; + struct rcu_head rcu; }; static u32 fw_hash(u32 handle) @@ -56,14 +59,16 @@ static u32 fw_hash(u32 handle) static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct fw_head *head = tp->root; + struct fw_head *head = rcu_dereference_bh(tp->root); struct fw_filter *f; int r; u32 id = skb->mark; if (head != NULL) { id &= head->mask; - for (f = head->ht[fw_hash(id)]; f; f = f->next) { + + for (f = rcu_dereference_bh(head->ht[fw_hash(id)]); f; + f = rcu_dereference_bh(f->next)) { if (f->id == id) { *res = f->res; #ifdef CONFIG_NET_CLS_IND @@ -92,13 +97,14 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long fw_get(struct tcf_proto *tp, u32 handle) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; if (head == NULL) return 0; - for (f = head->ht[fw_hash(handle)]; f; f = f->next) { + f = rtnl_dereference(head->ht[fw_hash(handle)]); + for (; f; f = rtnl_dereference(f->next)) { if (f->id == handle) return (unsigned long)f; } @@ -114,8 +120,11 @@ static int fw_init(struct tcf_proto *tp) return 0; } -static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) +static void fw_delete_filter(struct rcu_head *head) { + struct fw_filter *f = container_of(head, struct fw_filter, rcu); + struct tcf_proto *tp = f->tp; + tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(tp, &f->exts); kfree(f); @@ -123,7 +132,7 @@ static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) static void fw_destroy(struct tcf_proto *tp) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; int h; @@ -131,29 +140,33 @@ static void fw_destroy(struct tcf_proto *tp) return; for (h = 0; h < HTSIZE; h++) { - while ((f = head->ht[h]) != NULL) { - head->ht[h] = f->next; - fw_delete_filter(tp, f); + while ((f = rtnl_dereference(head->ht[h])) != NULL) { + RCU_INIT_POINTER(head->ht[h], + rtnl_dereference(f->next)); + call_rcu(&f->rcu, fw_delete_filter); } } - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); } static int fw_delete(struct tcf_proto *tp, unsigned long arg) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = (struct fw_filter *)arg; - struct fw_filter **fp; + struct fw_filter __rcu **fp; + struct fw_filter *pfp; if (head == NULL || f == NULL) goto out; - for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { - if (*fp == f) { - tcf_tree_lock(tp); - *fp = f->next; - tcf_tree_unlock(tp); - fw_delete_filter(tp, f); + fp = &head->ht[fw_hash(f->id)]; + + for (pfp = rtnl_dereference(*fp); pfp; + fp = &pfp->next, pfp = rtnl_dereference(*fp)) { + if (pfp == f) { + RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); + call_rcu(&f->rcu, fw_delete_filter); return 0; } } @@ -171,7 +184,7 @@ static int fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct tcf_exts e; u32 mask; int err; @@ -220,7 +233,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, struct nlattr **tca, unsigned long *arg, bool ovr) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = (struct fw_filter *) *arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FW_MAX + 1]; @@ -233,10 +246,42 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - if (f != NULL) { + if (f) { + struct fw_filter *pfp, *fnew; + struct fw_filter __rcu **fp; + if (f->id != handle && handle) return -EINVAL; - return fw_change_attrs(net, tp, f, tb, tca, base, ovr); + + fnew = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); + if (!fnew) + return -ENOBUFS; + + fnew->id = f->id; + fnew->res = f->res; +#ifdef CONFIG_NET_CLS_IND + fnew->ifindex = f->ifindex; +#endif /* CONFIG_NET_CLS_IND */ + fnew->tp = f->tp; + + err = fw_change_attrs(net, tp, fnew, tb, tca, base, ovr); + if (err < 0) { + kfree(fnew); + return err; + } + + fp = &head->ht[fw_hash(fnew->id)]; + for (pfp = rtnl_dereference(*fp); pfp; + fp = &pfp->next, pfp = rtnl_dereference(*fp)) + if (pfp == f) + break; + + RCU_INIT_POINTER(fnew->next, rtnl_dereference(pfp->next)); + rcu_assign_pointer(*fp, fnew); + call_rcu(&f->rcu, fw_delete_filter); + + *arg = (unsigned long)fnew; + return err; } if (!handle) @@ -252,9 +297,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return -ENOBUFS; head->mask = mask; - tcf_tree_lock(tp); - tp->root = head; - tcf_tree_unlock(tp); + rcu_assign_pointer(tp->root, head); } f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); @@ -263,15 +306,14 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); f->id = handle; + f->tp = tp; err = fw_change_attrs(net, tp, f, tb, tca, base, ovr); if (err < 0) goto errout; - f->next = head->ht[fw_hash(handle)]; - tcf_tree_lock(tp); - head->ht[fw_hash(handle)] = f; - tcf_tree_unlock(tp); + RCU_INIT_POINTER(f->next, head->ht[fw_hash(handle)]); + rcu_assign_pointer(head->ht[fw_hash(handle)], f); *arg = (unsigned long)f; return 0; @@ -283,7 +325,7 @@ errout: static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); int h; if (head == NULL) @@ -295,7 +337,8 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) for (h = 0; h < HTSIZE; h++) { struct fw_filter *f; - for (f = head->ht[h]; f; f = f->next) { + for (f = rtnl_dereference(head->ht[h]); f; + f = rtnl_dereference(f->next)) { if (arg->count < arg->skip) { arg->count++; continue; @@ -312,7 +355,7 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = (struct fw_filter *)fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; -- cgit v1.2.3 From 1109c00547fc66df45b9ff923544be4c1e1bec13 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:07:50 -0700 Subject: net: sched: RCU cls_route RCUify the route classifier. For now however spinlock's are used to protect fastmap cache. The issue here is the fastmap may be read by one CPU while the cache is being updated by another. An array of pointers could be one possible solution. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_route.c | 226 +++++++++++++++++++++++++++++--------------------- 1 file changed, 132 insertions(+), 94 deletions(-) (limited to 'net') diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index dd9fc2523c76..ba96deacf27c 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -29,25 +29,26 @@ * are mutually exclusive. * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX" */ - struct route4_fastmap { - struct route4_filter *filter; - u32 id; - int iif; + struct route4_filter *filter; + u32 id; + int iif; }; struct route4_head { - struct route4_fastmap fastmap[16]; - struct route4_bucket *table[256 + 1]; + struct route4_fastmap fastmap[16]; + struct route4_bucket __rcu *table[256 + 1]; + struct rcu_head rcu; }; struct route4_bucket { /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ - struct route4_filter *ht[16 + 16 + 1]; + struct route4_filter __rcu *ht[16 + 16 + 1]; + struct rcu_head rcu; }; struct route4_filter { - struct route4_filter *next; + struct route4_filter __rcu *next; u32 id; int iif; @@ -55,6 +56,8 @@ struct route4_filter { struct tcf_exts exts; u32 handle; struct route4_bucket *bkt; + struct tcf_proto *tp; + struct rcu_head rcu; }; #define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) @@ -64,14 +67,13 @@ static inline int route4_fastmap_hash(u32 id, int iif) return id & 0xF; } +static DEFINE_SPINLOCK(fastmap_lock); static void -route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id) +route4_reset_fastmap(struct route4_head *head) { - spinlock_t *root_lock = qdisc_root_sleeping_lock(q); - - spin_lock_bh(root_lock); + spin_lock_bh(&fastmap_lock); memset(head->fastmap, 0, sizeof(head->fastmap)); - spin_unlock_bh(root_lock); + spin_unlock_bh(&fastmap_lock); } static void @@ -80,9 +82,12 @@ route4_set_fastmap(struct route4_head *head, u32 id, int iif, { int h = route4_fastmap_hash(id, iif); + /* fastmap updates must look atomic to aling id, iff, filter */ + spin_lock_bh(&fastmap_lock); head->fastmap[h].id = id; head->fastmap[h].iif = iif; head->fastmap[h].filter = f; + spin_unlock_bh(&fastmap_lock); } static inline int route4_hash_to(u32 id) @@ -123,7 +128,7 @@ static inline int route4_hash_wild(void) static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct route4_head *head = tp->root; + struct route4_head *head = rcu_dereference_bh(tp->root); struct dst_entry *dst; struct route4_bucket *b; struct route4_filter *f; @@ -141,32 +146,43 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, iif = inet_iif(skb); h = route4_fastmap_hash(id, iif); + + spin_lock(&fastmap_lock); if (id == head->fastmap[h].id && iif == head->fastmap[h].iif && (f = head->fastmap[h].filter) != NULL) { - if (f == ROUTE4_FAILURE) + if (f == ROUTE4_FAILURE) { + spin_unlock(&fastmap_lock); goto failure; + } *res = f->res; + spin_unlock(&fastmap_lock); return 0; } + spin_unlock(&fastmap_lock); h = route4_hash_to(id); restart: - b = head->table[h]; + b = rcu_dereference_bh(head->table[h]); if (b) { - for (f = b->ht[route4_hash_from(id)]; f; f = f->next) + for (f = rcu_dereference_bh(b->ht[route4_hash_from(id)]); + f; + f = rcu_dereference_bh(f->next)) if (f->id == id) ROUTE4_APPLY_RESULT(); - for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next) + for (f = rcu_dereference_bh(b->ht[route4_hash_iif(iif)]); + f; + f = rcu_dereference_bh(f->next)) if (f->iif == iif) ROUTE4_APPLY_RESULT(); - for (f = b->ht[route4_hash_wild()]; f; f = f->next) + for (f = rcu_dereference_bh(b->ht[route4_hash_wild()]); + f; + f = rcu_dereference_bh(f->next)) ROUTE4_APPLY_RESULT(); - } if (h < 256) { h = 256; @@ -213,7 +229,7 @@ static inline u32 from_hash(u32 id) static unsigned long route4_get(struct tcf_proto *tp, u32 handle) { - struct route4_head *head = tp->root; + struct route4_head *head = rtnl_dereference(tp->root); struct route4_bucket *b; struct route4_filter *f; unsigned int h1, h2; @@ -229,9 +245,11 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle) if (h2 > 32) return 0; - b = head->table[h1]; + b = rtnl_dereference(head->table[h1]); if (b) { - for (f = b->ht[h2]; f; f = f->next) + for (f = rtnl_dereference(b->ht[h2]); + f; + f = rtnl_dereference(f->next)) if (f->handle == handle) return (unsigned long)f; } @@ -248,8 +266,11 @@ static int route4_init(struct tcf_proto *tp) } static void -route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) +route4_delete_filter(struct rcu_head *head) { + struct route4_filter *f = container_of(head, struct route4_filter, rcu); + struct tcf_proto *tp = f->tp; + tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(tp, &f->exts); kfree(f); @@ -257,7 +278,7 @@ route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) static void route4_destroy(struct tcf_proto *tp) { - struct route4_head *head = tp->root; + struct route4_head *head = rtnl_dereference(tp->root); int h1, h2; if (head == NULL) @@ -266,28 +287,35 @@ static void route4_destroy(struct tcf_proto *tp) for (h1 = 0; h1 <= 256; h1++) { struct route4_bucket *b; - b = head->table[h1]; + b = rtnl_dereference(head->table[h1]); if (b) { for (h2 = 0; h2 <= 32; h2++) { struct route4_filter *f; - while ((f = b->ht[h2]) != NULL) { - b->ht[h2] = f->next; - route4_delete_filter(tp, f); + while ((f = rtnl_dereference(b->ht[h2])) != NULL) { + struct route4_filter *next; + + next = rtnl_dereference(f->next); + RCU_INIT_POINTER(b->ht[h2], next); + call_rcu(&f->rcu, route4_delete_filter); } } - kfree(b); + RCU_INIT_POINTER(head->table[h1], NULL); + kfree_rcu(b, rcu); } } - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); } static int route4_delete(struct tcf_proto *tp, unsigned long arg) { - struct route4_head *head = tp->root; - struct route4_filter **fp, *f = (struct route4_filter *)arg; - unsigned int h = 0; + struct route4_head *head = rtnl_dereference(tp->root); + struct route4_filter *f = (struct route4_filter *)arg; + struct route4_filter __rcu **fp; + struct route4_filter *nf; struct route4_bucket *b; + unsigned int h = 0; int i; if (!head || !f) @@ -296,27 +324,35 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) h = f->handle; b = f->bkt; - for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) { - if (*fp == f) { - tcf_tree_lock(tp); - *fp = f->next; - tcf_tree_unlock(tp); + fp = &b->ht[from_hash(h >> 16)]; + for (nf = rtnl_dereference(*fp); nf; + fp = &nf->next, nf = rtnl_dereference(*fp)) { + if (nf == f) { + /* unlink it */ + RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); - route4_reset_fastmap(tp->q, head, f->id); - route4_delete_filter(tp, f); + /* Remove any fastmap lookups that might ref filter + * notice we unlink'd the filter so we can't get it + * back in the fastmap. + */ + route4_reset_fastmap(head); - /* Strip tree */ + /* Delete it */ + call_rcu(&f->rcu, route4_delete_filter); - for (i = 0; i <= 32; i++) - if (b->ht[i]) + /* Strip RTNL protected tree */ + for (i = 0; i <= 32; i++) { + struct route4_filter *rt; + + rt = rtnl_dereference(b->ht[i]); + if (rt) return 0; + } /* OK, session has no flows */ - tcf_tree_lock(tp); - head->table[to_hash(h)] = NULL; - tcf_tree_unlock(tp); + RCU_INIT_POINTER(head->table[to_hash(h)], NULL); + kfree_rcu(b, rcu); - kfree(b); return 0; } } @@ -380,26 +416,25 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, } h1 = to_hash(nhandle); - b = head->table[h1]; + b = rtnl_dereference(head->table[h1]); if (!b) { err = -ENOBUFS; b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL); if (b == NULL) goto errout; - tcf_tree_lock(tp); - head->table[h1] = b; - tcf_tree_unlock(tp); + rcu_assign_pointer(head->table[h1], b); } else { unsigned int h2 = from_hash(nhandle >> 16); err = -EEXIST; - for (fp = b->ht[h2]; fp; fp = fp->next) + for (fp = rtnl_dereference(b->ht[h2]); + fp; + fp = rtnl_dereference(fp->next)) if (fp->handle == f->handle) goto errout; } - tcf_tree_lock(tp); if (tb[TCA_ROUTE4_TO]) f->id = to; @@ -410,7 +445,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, f->handle = nhandle; f->bkt = b; - tcf_tree_unlock(tp); + f->tp = tp; if (tb[TCA_ROUTE4_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]); @@ -431,14 +466,15 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, struct nlattr **tca, unsigned long *arg, bool ovr) { - struct route4_head *head = tp->root; - struct route4_filter *f, *f1, **fp; + struct route4_head *head = rtnl_dereference(tp->root); + struct route4_filter __rcu **fp; + struct route4_filter *fold, *f1, *pfp, *f = NULL; struct route4_bucket *b; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_ROUTE4_MAX + 1]; unsigned int h, th; - u32 old_handle = 0; int err; + bool new = true; if (opt == NULL) return handle ? -EINVAL : 0; @@ -447,70 +483,70 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - f = (struct route4_filter *)*arg; - if (f) { - if (f->handle != handle && handle) + fold = (struct route4_filter *)*arg; + if (fold && handle && fold->handle != handle) return -EINVAL; - if (f->bkt) - old_handle = f->handle; - - err = route4_set_parms(net, tp, base, f, handle, head, tb, - tca[TCA_RATE], 0, ovr); - if (err < 0) - return err; - - goto reinsert; - } - err = -ENOBUFS; if (head == NULL) { head = kzalloc(sizeof(struct route4_head), GFP_KERNEL); if (head == NULL) goto errout; - - tcf_tree_lock(tp); - tp->root = head; - tcf_tree_unlock(tp); + rcu_assign_pointer(tp->root, head); } f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL); - if (f == NULL) + if (!f) goto errout; tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + if (fold) { + f->id = fold->id; + f->iif = fold->iif; + f->res = fold->res; + f->handle = fold->handle; + + f->tp = fold->tp; + f->bkt = fold->bkt; + new = false; + } + err = route4_set_parms(net, tp, base, f, handle, head, tb, - tca[TCA_RATE], 1, ovr); + tca[TCA_RATE], new, ovr); if (err < 0) goto errout; -reinsert: h = from_hash(f->handle >> 16); - for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next) + fp = &f->bkt->ht[h]; + for (pfp = rtnl_dereference(*fp); + (f1 = rtnl_dereference(*fp)) != NULL; + fp = &f1->next) if (f->handle < f1->handle) break; - f->next = f1; - tcf_tree_lock(tp); - *fp = f; + rcu_assign_pointer(f->next, f1); + rcu_assign_pointer(*fp, f); - if (old_handle && f->handle != old_handle) { - th = to_hash(old_handle); - h = from_hash(old_handle >> 16); - b = head->table[th]; + if (fold && fold->handle && f->handle != fold->handle) { + th = to_hash(fold->handle); + h = from_hash(fold->handle >> 16); + b = rtnl_dereference(head->table[th]); if (b) { - for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) { - if (*fp == f) { + fp = &b->ht[h]; + for (pfp = rtnl_dereference(*fp); pfp; + fp = &pfp->next, pfp = rtnl_dereference(*fp)) { + if (pfp == f) { *fp = f->next; break; } } } } - tcf_tree_unlock(tp); - route4_reset_fastmap(tp->q, head, f->id); + route4_reset_fastmap(head); *arg = (unsigned long)f; + if (fold) + call_rcu(&fold->rcu, route4_delete_filter); return 0; errout: @@ -520,7 +556,7 @@ errout: static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct route4_head *head = tp->root; + struct route4_head *head = rtnl_dereference(tp->root); unsigned int h, h1; if (head == NULL) @@ -530,13 +566,15 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) return; for (h = 0; h <= 256; h++) { - struct route4_bucket *b = head->table[h]; + struct route4_bucket *b = rtnl_dereference(head->table[h]); if (b) { for (h1 = 0; h1 <= 32; h1++) { struct route4_filter *f; - for (f = b->ht[h1]; f; f = f->next) { + for (f = rtnl_dereference(b->ht[h1]); + f; + f = rtnl_dereference(f->next)) { if (arg->count < arg->skip) { arg->count++; continue; -- cgit v1.2.3 From 331b72922c5f58d48fd5500acadc91777cc31970 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:08:20 -0700 Subject: net: sched: RCU cls_tcindex Make cls_tcindex RCU safe. This patch addds a new RCU routine rcu_dereference_bh_rtnl() to check caller either holds the rcu read lock or RTNL. This is needed to handle the case where tcindex_lookup() is being called in both cases. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 10 ++ net/sched/cls_tcindex.c | 248 ++++++++++++++++++++++++++++------------------ 2 files changed, 164 insertions(+), 94 deletions(-) (limited to 'net') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 167bae7bdfa4..6cacbce1a06c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -46,6 +46,16 @@ static inline int lockdep_rtnl_is_held(void) #define rcu_dereference_rtnl(p) \ rcu_dereference_check(p, lockdep_rtnl_is_held()) +/** + * rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking + * @p: The pointer to read, prior to dereference + * + * Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh() + * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference_bh() + */ +#define rcu_dereference_bh_rtnl(p) \ + rcu_dereference_bh_check(p, lockdep_rtnl_is_held()) + /** * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 3e9f76413b3b..a9f4279fbd69 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -32,19 +32,21 @@ struct tcindex_filter_result { struct tcindex_filter { u16 key; struct tcindex_filter_result result; - struct tcindex_filter *next; + struct tcindex_filter __rcu *next; + struct rcu_head rcu; }; struct tcindex_data { struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ - struct tcindex_filter **h; /* imperfect hash; only used if !perfect; - NULL if unused */ + struct tcindex_filter __rcu **h; /* imperfect hash; */ + struct tcf_proto *tp; u16 mask; /* AND key with mask */ - int shift; /* shift ANDed key to the right */ - int hash; /* hash table size; 0 if undefined */ - int alloc_hash; /* allocated size */ - int fall_through; /* 0: only classify if explicit match */ + u32 shift; /* shift ANDed key to the right */ + u32 hash; /* hash table size; 0 if undefined */ + u32 alloc_hash; /* allocated size */ + u32 fall_through; /* 0: only classify if explicit match */ + struct rcu_head rcu; }; static inline int @@ -56,13 +58,18 @@ tcindex_filter_is_set(struct tcindex_filter_result *r) static struct tcindex_filter_result * tcindex_lookup(struct tcindex_data *p, u16 key) { - struct tcindex_filter *f; + if (p->perfect) { + struct tcindex_filter_result *f = p->perfect + key; + + return tcindex_filter_is_set(f) ? f : NULL; + } else if (p->h) { + struct tcindex_filter __rcu **fp; + struct tcindex_filter *f; - if (p->perfect) - return tcindex_filter_is_set(p->perfect + key) ? - p->perfect + key : NULL; - else if (p->h) { - for (f = p->h[key % p->hash]; f; f = f->next) + fp = &p->h[key % p->hash]; + for (f = rcu_dereference_bh_rtnl(*fp); + f; + fp = &f->next, f = rcu_dereference_bh_rtnl(*fp)) if (f->key == key) return &f->result; } @@ -74,7 +81,7 @@ tcindex_lookup(struct tcindex_data *p, u16 key) static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rcu_dereference(tp->root); struct tcindex_filter_result *f; int key = (skb->tc_index & p->mask) >> p->shift; @@ -99,7 +106,7 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r; pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); @@ -129,49 +136,59 @@ static int tcindex_init(struct tcf_proto *tp) p->hash = DEFAULT_HASH_SIZE; p->fall_through = 1; - tp->root = p; + rcu_assign_pointer(tp->root, p); return 0; } - static int -__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) +tcindex_delete(struct tcf_proto *tp, unsigned long arg) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; + struct tcindex_filter __rcu **walk; struct tcindex_filter *f = NULL; - pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, f); + pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p\n", tp, arg, p); if (p->perfect) { if (!r->res.class) return -ENOENT; } else { int i; - struct tcindex_filter **walk = NULL; - for (i = 0; i < p->hash; i++) - for (walk = p->h+i; *walk; walk = &(*walk)->next) - if (&(*walk)->result == r) + for (i = 0; i < p->hash; i++) { + walk = p->h + i; + for (f = rtnl_dereference(*walk); f; + walk = &f->next, f = rtnl_dereference(*walk)) { + if (&f->result == r) goto found; + } + } return -ENOENT; found: - f = *walk; - if (lock) - tcf_tree_lock(tp); - *walk = f->next; - if (lock) - tcf_tree_unlock(tp); + rcu_assign_pointer(*walk, rtnl_dereference(f->next)); } tcf_unbind_filter(tp, &r->res); tcf_exts_destroy(tp, &r->exts); - kfree(f); + if (f) + kfree_rcu(f, rcu); return 0; } -static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) +static int tcindex_destroy_element(struct tcf_proto *tp, + unsigned long arg, + struct tcf_walker *walker) { - return __tcindex_delete(tp, arg, 1); + return tcindex_delete(tp, arg); +} + +static void __tcindex_destroy(struct rcu_head *head) +{ + struct tcindex_data *p = container_of(head, struct tcindex_data, rcu); + + kfree(p->perfect); + kfree(p->h); + kfree(p); } static inline int @@ -194,6 +211,14 @@ static void tcindex_filter_result_init(struct tcindex_filter_result *r) tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } +static void __tcindex_partial_destroy(struct rcu_head *head) +{ + struct tcindex_data *p = container_of(head, struct tcindex_data, rcu); + + kfree(p->perfect); + kfree(p); +} + static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, @@ -203,7 +228,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_filter_result cr; - struct tcindex_data cp; + struct tcindex_data *cp, *oldp; struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_exts e; @@ -212,84 +237,118 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (err < 0) return err; - memcpy(&cp, p, sizeof(cp)); - tcindex_filter_result_init(&new_filter_result); + /* tcindex_data attributes must look atomic to classifier/lookup so + * allocate new tcindex data and RCU assign it onto root. Keeping + * perfect hash and hash pointers from old data. + */ + cp = kzalloc(sizeof(cp), GFP_KERNEL); + if (!cp) + return -ENOMEM; + + cp->mask = p->mask; + cp->shift = p->shift; + cp->hash = p->hash; + cp->alloc_hash = p->alloc_hash; + cp->fall_through = p->fall_through; + cp->tp = tp; + + if (p->perfect) { + cp->perfect = kmemdup(p->perfect, + sizeof(*r) * cp->hash, GFP_KERNEL); + if (!cp->perfect) + goto errout; + } + cp->h = p->h; + + memset(&new_filter_result, 0, sizeof(new_filter_result)); + tcf_exts_init(&new_filter_result.exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); tcindex_filter_result_init(&cr); if (old_r) cr.res = r->res; if (tb[TCA_TCINDEX_HASH]) - cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); + cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); if (tb[TCA_TCINDEX_MASK]) - cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); + cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); if (tb[TCA_TCINDEX_SHIFT]) - cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); + cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); err = -EBUSY; + /* Hash already allocated, make sure that we still meet the * requirements for the allocated hash. */ - if (cp.perfect) { - if (!valid_perfect_hash(&cp) || - cp.hash > cp.alloc_hash) + if (cp->perfect) { + if (!valid_perfect_hash(cp) || + cp->hash > cp->alloc_hash) goto errout; - } else if (cp.h && cp.hash != cp.alloc_hash) + } else if (cp->h && cp->hash != cp->alloc_hash) { goto errout; + } err = -EINVAL; if (tb[TCA_TCINDEX_FALL_THROUGH]) - cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); + cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); - if (!cp.hash) { + if (!cp->hash) { /* Hash not specified, use perfect hash if the upper limit * of the hashing index is below the threshold. */ - if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) - cp.hash = (cp.mask >> cp.shift) + 1; + if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) + cp->hash = (cp->mask >> cp->shift) + 1; else - cp.hash = DEFAULT_HASH_SIZE; + cp->hash = DEFAULT_HASH_SIZE; } - if (!cp.perfect && !cp.h) - cp.alloc_hash = cp.hash; + if (!cp->perfect && cp->h) + cp->alloc_hash = cp->hash; /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) * but then, we'd fail handles that may become valid after some future * mask change. While this is extremely unlikely to ever matter, * the check below is safer (and also more backwards-compatible). */ - if (cp.perfect || valid_perfect_hash(&cp)) - if (handle >= cp.alloc_hash) + if (cp->perfect || valid_perfect_hash(cp)) + if (handle >= cp->alloc_hash) goto errout; err = -ENOMEM; - if (!cp.perfect && !cp.h) { - if (valid_perfect_hash(&cp)) { + if (!cp->perfect && !cp->h) { + if (valid_perfect_hash(cp)) { int i; - cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL); - if (!cp.perfect) + cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL); + if (!cp->perfect) goto errout; - for (i = 0; i < cp.hash; i++) - tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT, + for (i = 0; i < cp->hash; i++) + tcf_exts_init(&cp->perfect[i].exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); balloc = 1; } else { - cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); - if (!cp.h) + struct tcindex_filter __rcu **hash; + + hash = kcalloc(cp->hash, + sizeof(struct tcindex_filter *), + GFP_KERNEL); + + if (!hash) goto errout; + + cp->h = hash; balloc = 2; } } - if (cp.perfect) - r = cp.perfect + handle; + if (cp->perfect) + r = cp->perfect + handle; else - r = tcindex_lookup(&cp, handle) ? : &new_filter_result; + r = tcindex_lookup(cp, handle) ? : &new_filter_result; if (r == &new_filter_result) { f = kzalloc(sizeof(*f), GFP_KERNEL); @@ -307,33 +366,41 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, else tcf_exts_change(tp, &cr.exts, &e); - tcf_tree_lock(tp); if (old_r && old_r != r) tcindex_filter_result_init(old_r); - memcpy(p, &cp, sizeof(cp)); + oldp = p; r->res = cr.res; + rcu_assign_pointer(tp->root, cp); if (r == &new_filter_result) { - struct tcindex_filter **fp; + struct tcindex_filter *nfp; + struct tcindex_filter __rcu **fp; f->key = handle; f->result = new_filter_result; f->next = NULL; - for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next) - /* nothing */; - *fp = f; + + fp = p->h + (handle % p->hash); + for (nfp = rtnl_dereference(*fp); + nfp; + fp = &nfp->next, nfp = rtnl_dereference(*fp)) + ; /* nothing */ + + rcu_assign_pointer(*fp, f); } - tcf_tree_unlock(tp); + if (oldp) + call_rcu(&oldp->rcu, __tcindex_partial_destroy); return 0; errout_alloc: if (balloc == 1) - kfree(cp.perfect); + kfree(cp->perfect); else if (balloc == 2) - kfree(cp.h); + kfree(cp->h); errout: + kfree(cp); tcf_exts_destroy(tp, &e); return err; } @@ -345,7 +412,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; int err; @@ -364,10 +431,9 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, tca[TCA_RATE], ovr); } - static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter *f, *next; int i; @@ -390,8 +456,8 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) if (!p->h) return; for (i = 0; i < p->hash; i++) { - for (f = p->h[i]; f; f = next) { - next = f->next; + for (f = rtnl_dereference(p->h[i]); f; f = next) { + next = rtnl_dereference(f->next); if (walker->count >= walker->skip) { if (walker->fn(tp, (unsigned long) &f->result, walker) < 0) { @@ -404,17 +470,9 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) } } - -static int tcindex_destroy_element(struct tcf_proto *tp, - unsigned long arg, struct tcf_walker *walker) -{ - return __tcindex_delete(tp, arg, 0); -} - - static void tcindex_destroy(struct tcf_proto *tp) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcf_walker walker; pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); @@ -422,17 +480,16 @@ static void tcindex_destroy(struct tcf_proto *tp) walker.skip = 0; walker.fn = tcindex_destroy_element; tcindex_walk(tp, &walker); - kfree(p->perfect); - kfree(p->h); - kfree(p); - tp->root = NULL; + + RCU_INIT_POINTER(tp->root, NULL); + call_rcu(&p->rcu, __tcindex_destroy); } static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -455,15 +512,18 @@ static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, nla_nest_end(skb, nest); } else { if (p->perfect) { - t->tcm_handle = r-p->perfect; + t->tcm_handle = r - p->perfect; } else { struct tcindex_filter *f; + struct tcindex_filter __rcu **fp; int i; t->tcm_handle = 0; for (i = 0; !t->tcm_handle && i < p->hash; i++) { - for (f = p->h[i]; !t->tcm_handle && f; - f = f->next) { + fp = &p->h[i]; + for (f = rtnl_dereference(*fp); + !t->tcm_handle && f; + fp = &f->next, f = rtnl_dereference(*fp)) { if (&f->result == r) t->tcm_handle = f->key; } -- cgit v1.2.3 From 459d5f626da75573e985a7197b0919c3b143146c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:08:47 -0700 Subject: net: sched: make cls_u32 per cpu This uses per cpu counters in cls_u32 in preparation to convert over to rcu. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 75 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 70c0be8d0121..f3227d73a7ae 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -55,10 +55,12 @@ struct tc_u_knode { struct tcf_result res; struct tc_u_hnode *ht_down; #ifdef CONFIG_CLS_U32_PERF - struct tc_u32_pcnt *pf; + struct tc_u32_pcnt __percpu *pf; #endif #ifdef CONFIG_CLS_U32_MARK - struct tc_u32_mark mark; + u32 val; + u32 mask; + u32 __percpu *pcpu_success; #endif struct tc_u32_sel sel; }; @@ -115,16 +117,16 @@ next_knode: struct tc_u32_key *key = n->sel.keys; #ifdef CONFIG_CLS_U32_PERF - n->pf->rcnt += 1; + __this_cpu_inc(n->pf->rcnt); j = 0; #endif #ifdef CONFIG_CLS_U32_MARK - if ((skb->mark & n->mark.mask) != n->mark.val) { + if ((skb->mark & n->mask) != n->val) { n = n->next; goto next_knode; } else { - n->mark.success++; + __this_cpu_inc(*n->pcpu_success); } #endif @@ -143,7 +145,7 @@ next_knode: goto next_knode; } #ifdef CONFIG_CLS_U32_PERF - n->pf->kcnts[j] += 1; + __this_cpu_inc(n->pf->kcnts[j]); j++; #endif } @@ -159,7 +161,7 @@ check_terminal: } #endif #ifdef CONFIG_CLS_U32_PERF - n->pf->rhit += 1; + __this_cpu_inc(n->pf->rhit); #endif r = tcf_exts_exec(skb, &n->exts, res); if (r < 0) { @@ -342,7 +344,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) if (n->ht_down) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF - kfree(n->pf); + free_percpu(n->pf); #endif kfree(n); return 0; @@ -564,6 +566,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, struct nlattr *tb[TCA_U32_MAX + 1]; u32 htid; int err; +#ifdef CONFIG_CLS_U32_PERF + size_t size; +#endif if (opt == NULL) return handle ? -EINVAL : 0; @@ -642,8 +647,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -ENOBUFS; #ifdef CONFIG_CLS_U32_PERF - n->pf = kzalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL); - if (n->pf == NULL) { + size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64); + n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt)); + if (!n->pf) { kfree(n); return -ENOBUFS; } @@ -656,12 +662,14 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); #ifdef CONFIG_CLS_U32_MARK + n->pcpu_success = alloc_percpu(u32); + if (tb[TCA_U32_MARK]) { struct tc_u32_mark *mark; mark = nla_data(tb[TCA_U32_MARK]); - memcpy(&n->mark, mark, sizeof(struct tc_u32_mark)); - n->mark.success = 0; + n->val = mark->val; + n->mask = mark->mask; } #endif @@ -745,6 +753,11 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor)) goto nla_put_failure; } else { +#ifdef CONFIG_CLS_U32_PERF + struct tc_u32_pcnt *gpf; +#endif + int cpu; + if (nla_put(skb, TCA_U32_SEL, sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), &n->sel)) @@ -762,9 +775,20 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, goto nla_put_failure; #ifdef CONFIG_CLS_U32_MARK - if ((n->mark.val || n->mark.mask) && - nla_put(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark)) - goto nla_put_failure; + if ((n->val || n->mask)) { + struct tc_u32_mark mark = {.val = n->val, + .mask = n->mask, + .success = 0}; + + for_each_possible_cpu(cpu) { + __u32 cnt = *per_cpu_ptr(n->pcpu_success, cpu); + + mark.success += cnt; + } + + if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark)) + goto nla_put_failure; + } #endif if (tcf_exts_dump(skb, &n->exts) < 0) @@ -779,10 +803,29 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, } #endif #ifdef CONFIG_CLS_U32_PERF + gpf = kzalloc(sizeof(struct tc_u32_pcnt) + + n->sel.nkeys * sizeof(u64), + GFP_KERNEL); + if (!gpf) + goto nla_put_failure; + + for_each_possible_cpu(cpu) { + int i; + struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu); + + gpf->rcnt += pf->rcnt; + gpf->rhit += pf->rhit; + for (i = 0; i < n->sel.nkeys; i++) + gpf->kcnts[i] += pf->kcnts[i]; + } + if (nla_put(skb, TCA_U32_PCNT, sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), - n->pf)) + gpf)) { + kfree(gpf); goto nla_put_failure; + } + kfree(gpf); #endif } -- cgit v1.2.3 From 1ce87720d456e471de0fbd814dc5d1fe10fc1c44 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:09:16 -0700 Subject: net: sched: make cls_u32 lockless Make cls_u32 classifier safe to run without holding lock. This patch converts statistics that are kept in read section u32_classify into per cpu counters. This patch was tested with a tight u32 filter add/delete loop while generating traffic with pktgen. By running pktgen on vlan devices created on top of a physical device we can hit the qdisc layer correctly. For ingress qdisc's a loopback cable was used. for i in {1..100}; do q=`echo $i%8|bc`; echo -n "u32 tos: iteration $i on queue $q"; tc filter add dev p3p2 parent $p prio $i u32 match ip tos 0x10 0xff \ action skbedit queue_mapping $q; sleep 1; tc filter del dev p3p2 prio $i; echo -n "u32 tos hash table: iteration $i on queue $q"; tc filter add dev p3p2 parent $p protocol ip prio $i handle 628: u32 divisor 1 tc filter add dev p3p2 parent $p protocol ip prio $i u32 \ match ip protocol 17 0xff link 628: offset at 0 mask 0xf00 shift 6 plus 0 tc filter add dev p3p2 parent $p protocol ip prio $i u32 \ ht 628:0 match ip tos 0x10 0xff action skbedit queue_mapping $q sleep 2; tc filter del dev p3p2 prio $i sleep 1; done Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 183 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 110 insertions(+), 73 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index f3227d73a7ae..5ed5ac4361b1 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -44,16 +45,16 @@ #include struct tc_u_knode { - struct tc_u_knode *next; + struct tc_u_knode __rcu *next; u32 handle; - struct tc_u_hnode *ht_up; + struct tc_u_hnode __rcu *ht_up; struct tcf_exts exts; #ifdef CONFIG_NET_CLS_IND int ifindex; #endif u8 fshift; struct tcf_result res; - struct tc_u_hnode *ht_down; + struct tc_u_hnode __rcu *ht_down; #ifdef CONFIG_CLS_U32_PERF struct tc_u32_pcnt __percpu *pf; #endif @@ -62,24 +63,28 @@ struct tc_u_knode { u32 mask; u32 __percpu *pcpu_success; #endif + struct tcf_proto *tp; struct tc_u32_sel sel; + struct rcu_head rcu; }; struct tc_u_hnode { - struct tc_u_hnode *next; + struct tc_u_hnode __rcu *next; u32 handle; u32 prio; struct tc_u_common *tp_c; int refcnt; unsigned int divisor; - struct tc_u_knode *ht[1]; + struct tc_u_knode __rcu *ht[1]; + struct rcu_head rcu; }; struct tc_u_common { - struct tc_u_hnode *hlist; + struct tc_u_hnode __rcu *hlist; struct Qdisc *q; int refcnt; u32 hgenerator; + struct rcu_head rcu; }; static inline unsigned int u32_hash_fold(__be32 key, @@ -98,7 +103,7 @@ static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct unsigned int off; } stack[TC_U32_MAXDEPTH]; - struct tc_u_hnode *ht = tp->root; + struct tc_u_hnode *ht = rcu_dereference_bh(tp->root); unsigned int off = skb_network_offset(skb); struct tc_u_knode *n; int sdepth = 0; @@ -110,7 +115,7 @@ static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct int i, r; next_ht: - n = ht->ht[sel]; + n = rcu_dereference_bh(ht->ht[sel]); next_knode: if (n) { @@ -123,7 +128,7 @@ next_knode: #ifdef CONFIG_CLS_U32_MARK if ((skb->mark & n->mask) != n->val) { - n = n->next; + n = rcu_dereference_bh(n->next); goto next_knode; } else { __this_cpu_inc(*n->pcpu_success); @@ -141,7 +146,7 @@ next_knode: if (!data) goto out; if ((*data ^ key->val) & key->mask) { - n = n->next; + n = rcu_dereference_bh(n->next); goto next_knode; } #ifdef CONFIG_CLS_U32_PERF @@ -149,14 +154,16 @@ next_knode: j++; #endif } - if (n->ht_down == NULL) { + + ht = rcu_dereference_bh(n->ht_down); + if (!ht) { check_terminal: if (n->sel.flags & TC_U32_TERMINAL) { *res = n->res; #ifdef CONFIG_NET_CLS_IND if (!tcf_match_indev(skb, n->ifindex)) { - n = n->next; + n = rcu_dereference_bh(n->next); goto next_knode; } #endif @@ -165,13 +172,13 @@ check_terminal: #endif r = tcf_exts_exec(skb, &n->exts, res); if (r < 0) { - n = n->next; + n = rcu_dereference_bh(n->next); goto next_knode; } return r; } - n = n->next; + n = rcu_dereference_bh(n->next); goto next_knode; } @@ -182,7 +189,7 @@ check_terminal: stack[sdepth].off = off; sdepth++; - ht = n->ht_down; + ht = rcu_dereference_bh(n->ht_down); sel = 0; if (ht->divisor) { __be32 *data, hdata; @@ -224,7 +231,7 @@ check_terminal: /* POP */ if (sdepth--) { n = stack[sdepth].knode; - ht = n->ht_up; + ht = rcu_dereference_bh(n->ht_up); off = stack[sdepth].off; goto check_terminal; } @@ -241,7 +248,9 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) { struct tc_u_hnode *ht; - for (ht = tp_c->hlist; ht; ht = ht->next) + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) if (ht->handle == handle) break; @@ -258,7 +267,9 @@ u32_lookup_key(struct tc_u_hnode *ht, u32 handle) if (sel > ht->divisor) goto out; - for (n = ht->ht[sel]; n; n = n->next) + for (n = rtnl_dereference(ht->ht[sel]); + n; + n = rtnl_dereference(n->next)) if (n->handle == handle) break; out: @@ -272,7 +283,7 @@ static unsigned long u32_get(struct tcf_proto *tp, u32 handle) struct tc_u_common *tp_c = tp->data; if (TC_U32_HTID(handle) == TC_U32_ROOT) - ht = tp->root; + ht = rtnl_dereference(tp->root); else ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); @@ -293,6 +304,9 @@ static u32 gen_new_htid(struct tc_u_common *tp_c) { int i = 0x800; + /* hgenerator only used inside rtnl lock it is safe to increment + * without read _copy_ update semantics + */ do { if (++tp_c->hgenerator == 0x7FF) tp_c->hgenerator = 1; @@ -328,11 +342,11 @@ static int u32_init(struct tcf_proto *tp) } tp_c->refcnt++; - root_ht->next = tp_c->hlist; - tp_c->hlist = root_ht; + RCU_INIT_POINTER(root_ht->next, tp_c->hlist); + rcu_assign_pointer(tp_c->hlist, root_ht); root_ht->tp_c = tp_c; - tp->root = root_ht; + rcu_assign_pointer(tp->root, root_ht); tp->data = tp_c; return 0; } @@ -350,19 +364,27 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) return 0; } +static void u32_delete_key_rcu(struct rcu_head *rcu) +{ + struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); + + u32_destroy_key(key->tp, key); +} + static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { - struct tc_u_knode **kp; + struct tc_u_knode __rcu **kp; + struct tc_u_knode *pkp; struct tc_u_hnode *ht = key->ht_up; if (ht) { - for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) { - if (*kp == key) { - tcf_tree_lock(tp); - *kp = key->next; - tcf_tree_unlock(tp); + kp = &ht->ht[TC_U32_HASH(key->handle)]; + for (pkp = rtnl_dereference(*kp); pkp; + kp = &pkp->next, pkp = rtnl_dereference(*kp)) { + if (pkp == key) { + RCU_INIT_POINTER(*kp, key->next); - u32_destroy_key(tp, key); + call_rcu(&key->rcu, u32_delete_key_rcu); return 0; } } @@ -371,16 +393,16 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) return 0; } -static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +static void u32_clear_hnode(struct tc_u_hnode *ht) { struct tc_u_knode *n; unsigned int h; for (h = 0; h <= ht->divisor; h++) { - while ((n = ht->ht[h]) != NULL) { - ht->ht[h] = n->next; - - u32_destroy_key(tp, n); + while ((n = rtnl_dereference(ht->ht[h])) != NULL) { + RCU_INIT_POINTER(ht->ht[h], + rtnl_dereference(n->next)); + call_rcu(&n->rcu, u32_delete_key_rcu); } } } @@ -388,28 +410,31 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) { struct tc_u_common *tp_c = tp->data; - struct tc_u_hnode **hn; + struct tc_u_hnode __rcu **hn; + struct tc_u_hnode *phn; WARN_ON(ht->refcnt); - u32_clear_hnode(tp, ht); + u32_clear_hnode(ht); - for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) { - if (*hn == ht) { - *hn = ht->next; - kfree(ht); + hn = &tp_c->hlist; + for (phn = rtnl_dereference(*hn); + phn; + hn = &phn->next, phn = rtnl_dereference(*hn)) { + if (phn == ht) { + RCU_INIT_POINTER(*hn, ht->next); + kfree_rcu(ht, rcu); return 0; } } - WARN_ON(1); return -ENOENT; } static void u32_destroy(struct tcf_proto *tp) { struct tc_u_common *tp_c = tp->data; - struct tc_u_hnode *root_ht = tp->root; + struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); WARN_ON(root_ht == NULL); @@ -421,17 +446,16 @@ static void u32_destroy(struct tcf_proto *tp) tp->q->u32_node = NULL; - for (ht = tp_c->hlist; ht; ht = ht->next) { + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) { ht->refcnt--; - u32_clear_hnode(tp, ht); + u32_clear_hnode(ht); } - while ((ht = tp_c->hlist) != NULL) { - tp_c->hlist = ht->next; - - WARN_ON(ht->refcnt != 0); - - kfree(ht); + while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) { + RCU_INIT_POINTER(tp_c->hlist, ht->next); + kfree_rcu(ht, rcu); } kfree(tp_c); @@ -443,6 +467,7 @@ static void u32_destroy(struct tcf_proto *tp) static int u32_delete(struct tcf_proto *tp, unsigned long arg) { struct tc_u_hnode *ht = (struct tc_u_hnode *)arg; + struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); if (ht == NULL) return 0; @@ -450,7 +475,7 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg) if (TC_U32_KEY(ht->handle)) return u32_delete_key(tp, (struct tc_u_knode *)ht); - if (tp->root == ht) + if (root_ht == ht) return -EINVAL; if (ht->refcnt == 1) { @@ -473,7 +498,9 @@ static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) if (!bitmap) return handle | 0xFFF; - for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next) + for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]); + n; + n = rtnl_dereference(n->next)) set_bit(TC_U32_NODE(n->handle), bitmap); i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); @@ -523,10 +550,8 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, ht_down->refcnt++; } - tcf_tree_lock(tp); - ht_old = n->ht_down; - n->ht_down = ht_down; - tcf_tree_unlock(tp); + ht_old = rtnl_dereference(n->ht_down); + rcu_assign_pointer(n->ht_down, ht_down); if (ht_old) ht_old->refcnt--; @@ -606,8 +631,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; - ht->next = tp_c->hlist; - tp_c->hlist = ht; + RCU_INIT_POINTER(ht->next, tp_c->hlist); + rcu_assign_pointer(tp_c->hlist, ht); *arg = (unsigned long)ht; return 0; } @@ -615,7 +640,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (tb[TCA_U32_HASH]) { htid = nla_get_u32(tb[TCA_U32_HASH]); if (TC_U32_HTID(htid) == TC_U32_ROOT) { - ht = tp->root; + ht = rtnl_dereference(tp->root); htid = ht->handle; } else { ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); @@ -623,7 +648,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } } else { - ht = tp->root; + ht = rtnl_dereference(tp->root); htid = ht->handle; } @@ -660,6 +685,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); + n->tp = tp; #ifdef CONFIG_CLS_U32_MARK n->pcpu_success = alloc_percpu(u32); @@ -675,21 +701,23 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr); if (err == 0) { - struct tc_u_knode **ins; - for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) - if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle)) + struct tc_u_knode __rcu **ins; + struct tc_u_knode *pins; + + ins = &ht->ht[TC_U32_HASH(handle)]; + for (pins = rtnl_dereference(*ins); pins; + ins = &pins->next, pins = rtnl_dereference(*ins)) + if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle)) break; - n->next = *ins; - tcf_tree_lock(tp); - *ins = n; - tcf_tree_unlock(tp); + RCU_INIT_POINTER(n->next, pins); + rcu_assign_pointer(*ins, n); *arg = (unsigned long)n; return 0; } #ifdef CONFIG_CLS_U32_PERF - kfree(n->pf); + free_percpu(n->pf); #endif kfree(n); return err; @@ -705,7 +733,9 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) if (arg->stop) return; - for (ht = tp_c->hlist; ht; ht = ht->next) { + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) { if (ht->prio != tp->prio) continue; if (arg->count >= arg->skip) { @@ -716,7 +746,9 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } arg->count++; for (h = 0; h <= ht->divisor; h++) { - for (n = ht->ht[h]; n; n = n->next) { + for (n = rtnl_dereference(ht->ht[h]); + n; + n = rtnl_dereference(n->next)) { if (arg->count < arg->skip) { arg->count++; continue; @@ -735,6 +767,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct tc_u_knode *n = (struct tc_u_knode *)fh; + struct tc_u_hnode *ht_up, *ht_down; struct nlattr *nest; if (n == NULL) @@ -762,7 +795,9 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), &n->sel)) goto nla_put_failure; - if (n->ht_up) { + + ht_up = rtnl_dereference(n->ht_up); + if (ht_up) { u32 htid = n->handle & 0xFFFFF000; if (nla_put_u32(skb, TCA_U32_HASH, htid)) goto nla_put_failure; @@ -770,8 +805,10 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (n->res.classid && nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid)) goto nla_put_failure; - if (n->ht_down && - nla_put_u32(skb, TCA_U32_LINK, n->ht_down->handle)) + + ht_down = rtnl_dereference(n->ht_down); + if (ht_down && + nla_put_u32(skb, TCA_U32_LINK, ht_down->handle)) goto nla_put_failure; #ifdef CONFIG_CLS_U32_MARK -- cgit v1.2.3 From b929d86d25352496c528fcd74fdcabe3f6a4994a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:09:49 -0700 Subject: net: sched: rcu'ify cls_rsvp Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_rsvp.h | 160 +++++++++++++++++++++++++++++---------------------- 1 file changed, 90 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 1020e233a5d6..b044c208b133 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -70,31 +70,34 @@ struct rsvp_head { u32 tmap[256/32]; u32 hgenerator; u8 tgenerator; - struct rsvp_session *ht[256]; + struct rsvp_session __rcu *ht[256]; + struct rcu_head rcu; }; struct rsvp_session { - struct rsvp_session *next; - __be32 dst[RSVP_DST_LEN]; - struct tc_rsvp_gpi dpi; - u8 protocol; - u8 tunnelid; + struct rsvp_session __rcu *next; + __be32 dst[RSVP_DST_LEN]; + struct tc_rsvp_gpi dpi; + u8 protocol; + u8 tunnelid; /* 16 (src,sport) hash slots, and one wildcard source slot */ - struct rsvp_filter *ht[16 + 1]; + struct rsvp_filter __rcu *ht[16 + 1]; + struct rcu_head rcu; }; struct rsvp_filter { - struct rsvp_filter *next; - __be32 src[RSVP_DST_LEN]; - struct tc_rsvp_gpi spi; - u8 tunnelhdr; + struct rsvp_filter __rcu *next; + __be32 src[RSVP_DST_LEN]; + struct tc_rsvp_gpi spi; + u8 tunnelhdr; - struct tcf_result res; - struct tcf_exts exts; + struct tcf_result res; + struct tcf_exts exts; - u32 handle; - struct rsvp_session *sess; + u32 handle; + struct rsvp_session *sess; + struct rcu_head rcu; }; static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) @@ -128,7 +131,7 @@ static inline unsigned int hash_src(__be32 *src) static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; + struct rsvp_head *head = rcu_dereference_bh(tp->root); struct rsvp_session *s; struct rsvp_filter *f; unsigned int h1, h2; @@ -169,7 +172,8 @@ restart: h1 = hash_dst(dst, protocol, tunnelid); h2 = hash_src(src); - for (s = sht[h1]; s; s = s->next) { + for (s = rcu_dereference_bh(head->ht[h1]); s; + s = rcu_dereference_bh(s->next)) { if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] && protocol == s->protocol && !(s->dpi.mask & @@ -181,7 +185,8 @@ restart: #endif tunnelid == s->tunnelid) { - for (f = s->ht[h2]; f; f = f->next) { + for (f = rcu_dereference_bh(s->ht[h2]); f; + f = rcu_dereference_bh(f->next)) { if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] && !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key)) #if RSVP_DST_LEN == 4 @@ -205,7 +210,8 @@ matched: } /* And wildcard bucket... */ - for (f = s->ht[16]; f; f = f->next) { + for (f = rcu_dereference_bh(s->ht[16]); f; + f = rcu_dereference_bh(f->next)) { *res = f->res; RSVP_APPLY_RESULT(); goto matched; @@ -218,7 +224,7 @@ matched: static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) { - struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; + struct rsvp_head *head = rtnl_dereference(tp->root); struct rsvp_session *s; struct rsvp_filter *f; unsigned int h1 = handle & 0xFF; @@ -227,8 +233,10 @@ static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) if (h2 > 16) return 0; - for (s = sht[h1]; s; s = s->next) { - for (f = s->ht[h2]; f; f = f->next) { + for (s = rtnl_dereference(head->ht[h1]); s; + s = rtnl_dereference(s->next)) { + for (f = rtnl_dereference(s->ht[h2]); f; + f = rtnl_dereference(f->next)) { if (f->handle == handle) return (unsigned long)f; } @@ -246,7 +254,7 @@ static int rsvp_init(struct tcf_proto *tp) data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL); if (data) { - tp->root = data; + rcu_assign_pointer(tp->root, data); return 0; } return -ENOBUFS; @@ -257,53 +265,54 @@ rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) { tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(tp, &f->exts); - kfree(f); + kfree_rcu(f, rcu); } static void rsvp_destroy(struct tcf_proto *tp) { - struct rsvp_head *data = xchg(&tp->root, NULL); - struct rsvp_session **sht; + struct rsvp_head *data = rtnl_dereference(tp->root); int h1, h2; if (data == NULL) return; - sht = data->ht; + RCU_INIT_POINTER(tp->root, NULL); for (h1 = 0; h1 < 256; h1++) { struct rsvp_session *s; - while ((s = sht[h1]) != NULL) { - sht[h1] = s->next; + while ((s = rtnl_dereference(data->ht[h1])) != NULL) { + RCU_INIT_POINTER(data->ht[h1], s->next); for (h2 = 0; h2 <= 16; h2++) { struct rsvp_filter *f; - while ((f = s->ht[h2]) != NULL) { - s->ht[h2] = f->next; + while ((f = rtnl_dereference(s->ht[h2])) != NULL) { + rcu_assign_pointer(s->ht[h2], f->next); rsvp_delete_filter(tp, f); } } - kfree(s); + kfree_rcu(s, rcu); } } - kfree(data); + kfree_rcu(data, rcu); } static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) { - struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg; + struct rsvp_head *head = rtnl_dereference(tp->root); + struct rsvp_filter *nfp, *f = (struct rsvp_filter *)arg; + struct rsvp_filter __rcu **fp; unsigned int h = f->handle; - struct rsvp_session **sp; - struct rsvp_session *s = f->sess; + struct rsvp_session __rcu **sp; + struct rsvp_session *nsp, *s = f->sess; int i; - for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) { - if (*fp == f) { - tcf_tree_lock(tp); - *fp = f->next; - tcf_tree_unlock(tp); + fp = &s->ht[(h >> 8) & 0xFF]; + for (nfp = rtnl_dereference(*fp); nfp; + fp = &nfp->next, nfp = rtnl_dereference(*fp)) { + if (nfp == f) { + RCU_INIT_POINTER(*fp, f->next); rsvp_delete_filter(tp, f); /* Strip tree */ @@ -313,14 +322,12 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) return 0; /* OK, session has no flows */ - for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF]; - *sp; sp = &(*sp)->next) { - if (*sp == s) { - tcf_tree_lock(tp); - *sp = s->next; - tcf_tree_unlock(tp); - - kfree(s); + sp = &head->ht[h & 0xFF]; + for (nsp = rtnl_dereference(*sp); nsp; + sp = &nsp->next, nsp = rtnl_dereference(*sp)) { + if (nsp == s) { + RCU_INIT_POINTER(*sp, s->next); + kfree_rcu(s, rcu); return 0; } } @@ -333,7 +340,7 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt) { - struct rsvp_head *data = tp->root; + struct rsvp_head *data = rtnl_dereference(tp->root); int i = 0xFFFF; while (i-- > 0) { @@ -361,7 +368,7 @@ static int tunnel_bts(struct rsvp_head *data) static void tunnel_recycle(struct rsvp_head *data) { - struct rsvp_session **sht = data->ht; + struct rsvp_session __rcu **sht = data->ht; u32 tmap[256/32]; int h1, h2; @@ -369,11 +376,13 @@ static void tunnel_recycle(struct rsvp_head *data) for (h1 = 0; h1 < 256; h1++) { struct rsvp_session *s; - for (s = sht[h1]; s; s = s->next) { + for (s = rtnl_dereference(sht[h1]); s; + s = rtnl_dereference(s->next)) { for (h2 = 0; h2 <= 16; h2++) { struct rsvp_filter *f; - for (f = s->ht[h2]; f; f = f->next) { + for (f = rtnl_dereference(s->ht[h2]); f; + f = rtnl_dereference(f->next)) { if (f->tunnelhdr == 0) continue; data->tgenerator = f->res.classid; @@ -417,9 +426,11 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct nlattr **tca, unsigned long *arg, bool ovr) { - struct rsvp_head *data = tp->root; - struct rsvp_filter *f, **fp; - struct rsvp_session *s, **sp; + struct rsvp_head *data = rtnl_dereference(tp->root); + struct rsvp_filter *f, *nfp; + struct rsvp_filter __rcu **fp; + struct rsvp_session *nsp, *s; + struct rsvp_session __rcu **sp; struct tc_rsvp_pinfo *pinfo = NULL; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_RSVP_MAX + 1]; @@ -499,7 +510,9 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, goto errout; } - for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) { + for (sp = &data->ht[h1]; + (s = rtnl_dereference(*sp)) != NULL; + sp = &s->next) { if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && pinfo && pinfo->protocol == s->protocol && memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 && @@ -521,12 +534,16 @@ insert: tcf_exts_change(tp, &f->exts, &e); - for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) - if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask) + fp = &s->ht[h2]; + for (nfp = rtnl_dereference(*fp); nfp; + fp = &nfp->next, nfp = rtnl_dereference(*fp)) { + __u32 mask = nfp->spi.mask & f->spi.mask; + + if (mask != f->spi.mask) break; - f->next = *fp; - wmb(); - *fp = f; + } + RCU_INIT_POINTER(f->next, nfp); + rcu_assign_pointer(*fp, f); *arg = (unsigned long)f; return 0; @@ -546,13 +563,14 @@ insert: s->protocol = pinfo->protocol; s->tunnelid = pinfo->tunnelid; } - for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) { - if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask) + sp = &data->ht[h1]; + for (nsp = rtnl_dereference(*sp); nsp; + sp = &nsp->next, nsp = rtnl_dereference(*sp)) { + if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask) break; } - s->next = *sp; - wmb(); - *sp = s; + RCU_INIT_POINTER(s->next, nsp); + rcu_assign_pointer(*sp, s); goto insert; @@ -565,7 +583,7 @@ errout2: static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct rsvp_head *head = tp->root; + struct rsvp_head *head = rtnl_dereference(tp->root); unsigned int h, h1; if (arg->stop) @@ -574,11 +592,13 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) for (h = 0; h < 256; h++) { struct rsvp_session *s; - for (s = head->ht[h]; s; s = s->next) { + for (s = rtnl_dereference(head->ht[h]); s; + s = rtnl_dereference(s->next)) { for (h1 = 0; h1 <= 16; h1++) { struct rsvp_filter *f; - for (f = s->ht[h1]; f; f = f->next) { + for (f = rtnl_dereference(s->ht[h1]); f; + f = rtnl_dereference(f->next)) { if (arg->count < arg->skip) { arg->count++; continue; -- cgit v1.2.3 From 1f947bf151e90ec0baad294881607ebf321a2863 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:10:24 -0700 Subject: net: sched: rcu'ify cls_bpf This patch makes the cls_bpf classifier RCU safe. The tcf_lock was being used to protect a list of cls_bpf_prog now this list is RCU safe and updates occur with rcu_replace. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 94 ++++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 0e30d58149da..6a7386e6e5a8 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -27,6 +27,7 @@ MODULE_DESCRIPTION("TC BPF based classifier"); struct cls_bpf_head { struct list_head plist; u32 hgen; + struct rcu_head rcu; }; struct cls_bpf_prog { @@ -37,6 +38,8 @@ struct cls_bpf_prog { struct list_head link; u32 handle; u16 bpf_len; + struct tcf_proto *tp; + struct rcu_head rcu; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { @@ -49,11 +52,11 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct cls_bpf_head *head = tp->root; + struct cls_bpf_head *head = rcu_dereference(tp->root); struct cls_bpf_prog *prog; int ret; - list_for_each_entry(prog, &head->plist, link) { + list_for_each_entry_rcu(prog, &head->plist, link) { int filter_res = BPF_PROG_RUN(prog->filter, skb); if (filter_res == 0) @@ -81,8 +84,8 @@ static int cls_bpf_init(struct tcf_proto *tp) if (head == NULL) return -ENOBUFS; - INIT_LIST_HEAD(&head->plist); - tp->root = head; + INIT_LIST_HEAD_RCU(&head->plist); + rcu_assign_pointer(tp->root, head); return 0; } @@ -98,18 +101,22 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) kfree(prog); } +static void __cls_bpf_delete_prog(struct rcu_head *rcu) +{ + struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu); + + cls_bpf_delete_prog(prog->tp, prog); +} + static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) { - struct cls_bpf_head *head = tp->root; + struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg; list_for_each_entry(prog, &head->plist, link) { if (prog == todel) { - tcf_tree_lock(tp); - list_del(&prog->link); - tcf_tree_unlock(tp); - - cls_bpf_delete_prog(tp, prog); + list_del_rcu(&prog->link); + call_rcu(&prog->rcu, __cls_bpf_delete_prog); return 0; } } @@ -119,27 +126,28 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) static void cls_bpf_destroy(struct tcf_proto *tp) { - struct cls_bpf_head *head = tp->root; + struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog, *tmp; list_for_each_entry_safe(prog, tmp, &head->plist, link) { - list_del(&prog->link); - cls_bpf_delete_prog(tp, prog); + list_del_rcu(&prog->link); + call_rcu(&prog->rcu, __cls_bpf_delete_prog); } - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); } static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) { - struct cls_bpf_head *head = tp->root; + struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog; unsigned long ret = 0UL; if (head == NULL) return 0UL; - list_for_each_entry(prog, &head->plist, link) { + list_for_each_entry_rcu(prog, &head->plist, link) { if (prog->handle == handle) { ret = (unsigned long) prog; break; @@ -158,10 +166,10 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, unsigned long base, struct nlattr **tb, struct nlattr *est, bool ovr) { - struct sock_filter *bpf_ops, *bpf_old; + struct sock_filter *bpf_ops; struct tcf_exts exts; struct sock_fprog_kern tmp; - struct bpf_prog *fp, *fp_old; + struct bpf_prog *fp; u16 bpf_size, bpf_len; u32 classid; int ret; @@ -197,26 +205,15 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, if (ret) goto errout_free; - tcf_tree_lock(tp); - fp_old = prog->filter; - bpf_old = prog->bpf_ops; - prog->bpf_len = bpf_len; prog->bpf_ops = bpf_ops; prog->filter = fp; prog->res.classid = classid; - tcf_tree_unlock(tp); tcf_bind_filter(tp, &prog->res, base); tcf_exts_change(tp, &prog->exts, &exts); - if (fp_old) - bpf_prog_destroy(fp_old); - if (bpf_old) - kfree(bpf_old); - return 0; - errout_free: kfree(bpf_ops); errout: @@ -244,9 +241,10 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, u32 handle, struct nlattr **tca, unsigned long *arg, bool ovr) { - struct cls_bpf_head *head = tp->root; - struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg; + struct cls_bpf_head *head = rtnl_dereference(tp->root); + struct cls_bpf_prog *oldprog = (struct cls_bpf_prog *) *arg; struct nlattr *tb[TCA_BPF_MAX + 1]; + struct cls_bpf_prog *prog; int ret; if (tca[TCA_OPTIONS] == NULL) @@ -256,18 +254,19 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (ret < 0) return ret; - if (prog != NULL) { - if (handle && prog->handle != handle) - return -EINVAL; - return cls_bpf_modify_existing(net, tp, prog, base, tb, - tca[TCA_RATE], ovr); - } - prog = kzalloc(sizeof(*prog), GFP_KERNEL); - if (prog == NULL) + if (!prog) return -ENOBUFS; tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + + if (oldprog) { + if (handle && oldprog->handle != handle) { + ret = -EINVAL; + goto errout; + } + } + if (handle == 0) prog->handle = cls_bpf_grab_new_handle(tp, head); else @@ -281,16 +280,17 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (ret < 0) goto errout; - tcf_tree_lock(tp); - list_add(&prog->link, &head->plist); - tcf_tree_unlock(tp); + if (oldprog) { + list_replace_rcu(&prog->link, &oldprog->link); + call_rcu(&oldprog->rcu, __cls_bpf_delete_prog); + } else { + list_add_rcu(&prog->link, &head->plist); + } *arg = (unsigned long) prog; - return 0; errout: - if (*arg == 0UL && prog) - kfree(prog); + kfree(prog); return ret; } @@ -339,10 +339,10 @@ nla_put_failure: static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct cls_bpf_head *head = tp->root; + struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog; - list_for_each_entry(prog, &head->plist, link) { + list_for_each_entry_rcu(prog, &head->plist, link) { if (arg->count < arg->skip) goto skip; if (arg->fn(tp, (unsigned long) prog, arg) < 0) { -- cgit v1.2.3 From 6c555490e0ce885a9caf0a045db69382a3ccbc9c Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:09 -0700 Subject: ipv6: drop useless rcu_read_lock() in anycast These code is now protected by rtnl lock, rcu read lock is useless now. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- net/core/dev.c | 14 ++++++++------ net/ipv6/anycast.c | 18 ++++++------------ 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ae721f53739e..ee38b948d9a0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2083,8 +2083,8 @@ void __dev_remove_pack(struct packet_type *pt); void dev_add_offload(struct packet_offload *po); void dev_remove_offload(struct packet_offload *po); -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short flags, - unsigned short mask); +struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags, + unsigned short mask); struct net_device *dev_get_by_name(struct net *net, const char *name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); diff --git a/net/core/dev.c b/net/core/dev.c index b3d6dbc0c696..e916ba8caccf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -897,23 +897,25 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags_rcu - find any device with given flags + * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside - * rcu_read_lock(), and result refcount is unchanged. + * rtnl_lock(), and result refcount is unchanged. */ -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, + unsigned short mask) { struct net_device *dev, *ret; + ASSERT_RTNL(); + ret = NULL; - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; @@ -921,7 +923,7 @@ struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags } return ret; } -EXPORT_SYMBOL(dev_get_by_flags_rcu); +EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index ff2de7d9d8e6..3b0429bc6b5a 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -78,7 +78,6 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) pac->acl_addr = *addr; rtnl_lock(); - rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; @@ -91,11 +90,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) goto error; } else { /* router, no matching interface: just pick one */ - dev = dev_get_by_flags_rcu(net, IFF_UP, - IFF_UP | IFF_LOOPBACK); + dev = __dev_get_by_flags(net, IFF_UP, + IFF_UP | IFF_LOOPBACK); } } else - dev = dev_get_by_index_rcu(net, ifindex); + dev = __dev_get_by_index(net, ifindex); if (dev == NULL) { err = -ENODEV; @@ -137,7 +136,6 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) } error: - rcu_read_unlock(); rtnl_unlock(); if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); @@ -174,11 +172,9 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) spin_unlock_bh(&ipv6_sk_ac_lock); rtnl_lock(); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + dev = __dev_get_by_index(net, pac->acl_ifindex); if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); - rcu_read_unlock(); rtnl_unlock(); sock_kfree_s(sk, pac, sizeof(*pac)); @@ -203,12 +199,11 @@ void ipv6_sock_ac_close(struct sock *sk) prev_index = 0; rtnl_lock(); - rcu_read_lock(); while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { - dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + dev = __dev_get_by_index(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) @@ -216,7 +211,6 @@ void ipv6_sock_ac_close(struct sock *sk) sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } - rcu_read_unlock(); rtnl_unlock(); } @@ -341,7 +335,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) return 0; } -/* called with rcu_read_lock() */ +/* called with rtnl_lock() */ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev = __in6_dev_get(dev); -- cgit v1.2.3 From b03a9c04a3a605815c232506e1d76281afe0946f Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:10 -0700 Subject: ipv6: remove ipv6_sk_ac_lock Just move rtnl lock up, so that the anycast list can be protected by rtnl lock now. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/anycast.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 3b0429bc6b5a..d10f2e2b49c6 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -46,10 +46,6 @@ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); -/* Big ac list lock for all the sockets */ -static DEFINE_SPINLOCK(ipv6_sk_ac_lock); - - /* * socket join an anycast group */ @@ -128,10 +124,8 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) err = ipv6_dev_ac_inc(dev, addr); if (!err) { - spin_lock_bh(&ipv6_sk_ac_lock); pac->acl_next = np->ipv6_ac_list; np->ipv6_ac_list = pac; - spin_unlock_bh(&ipv6_sk_ac_lock); pac = NULL; } @@ -152,7 +146,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) struct ipv6_ac_socklist *pac, *prev_pac; struct net *net = sock_net(sk); - spin_lock_bh(&ipv6_sk_ac_lock); + rtnl_lock(); prev_pac = NULL; for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { if ((ifindex == 0 || pac->acl_ifindex == ifindex) && @@ -161,7 +155,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) prev_pac = pac; } if (!pac) { - spin_unlock_bh(&ipv6_sk_ac_lock); + rtnl_unlock(); return -ENOENT; } if (prev_pac) @@ -169,9 +163,6 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) else np->ipv6_ac_list = pac->acl_next; - spin_unlock_bh(&ipv6_sk_ac_lock); - - rtnl_lock(); dev = __dev_get_by_index(net, pac->acl_ifindex); if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); @@ -192,13 +183,11 @@ void ipv6_sock_ac_close(struct sock *sk) if (!np->ipv6_ac_list) return; - spin_lock_bh(&ipv6_sk_ac_lock); + rtnl_lock(); pac = np->ipv6_ac_list; np->ipv6_ac_list = NULL; - spin_unlock_bh(&ipv6_sk_ac_lock); prev_index = 0; - rtnl_lock(); while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; -- cgit v1.2.3 From 013b4d90387a5dca54281263e0d4650db97bd67c Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:11 -0700 Subject: ipv6: clean up ipv6_dev_ac_inc() Make it accept inet6_dev, and rename it to __ipv6_dev_ac_inc() to reflect this change. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/anycast.c | 13 ++++--------- 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index f679877bb601..9b1d42e66cca 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -202,7 +202,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void ipv6_sock_ac_close(struct sock *sk); -int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr); +int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr); int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr); bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ad4598fcc416..6b6a373a30b2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1725,7 +1725,7 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); if (ipv6_addr_any(&addr)) return; - ipv6_dev_ac_inc(ifp->idev->dev, &addr); + __ipv6_dev_ac_inc(ifp->idev, &addr); } /* caller must hold RTNL */ diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index d10f2e2b49c6..66c19320119a 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -122,7 +122,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) goto error; } - err = ipv6_dev_ac_inc(dev, addr); + err = __ipv6_dev_ac_inc(idev, addr); if (!err) { pac->acl_next = np->ipv6_ac_list; np->ipv6_ac_list = pac; @@ -215,20 +215,15 @@ static void aca_put(struct ifacaddr6 *ac) /* * device anycast group inc (add if not found) */ -int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) +int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; - struct inet6_dev *idev; struct rt6_info *rt; int err; ASSERT_RTNL(); - idev = in6_dev_get(dev); - - if (idev == NULL) - return -EINVAL; - + in6_dev_hold(idev); write_lock_bh(&idev->lock); if (idev->dead) { err = -ENODEV; @@ -276,7 +271,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) ip6_ins_rt(rt); - addrconf_join_solict(dev, &aca->aca_addr); + addrconf_join_solict(idev->dev, &aca->aca_addr); aca_put(aca); return 0; -- cgit v1.2.3 From 83aa29eefdb152d65e65a90605593766b4f793ef Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:12 -0700 Subject: ipv6: refactor __ipv6_dev_ac_inc() Refactor out allocation and initialization and make the refcount code more readable. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/anycast.c | 62 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 66c19320119a..952c1fd06150 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -203,6 +203,11 @@ void ipv6_sock_ac_close(struct sock *sk) rtnl_unlock(); } +static void aca_get(struct ifacaddr6 *aca) +{ + atomic_inc(&aca->aca_refcnt); +} + static void aca_put(struct ifacaddr6 *ac) { if (atomic_dec_and_test(&ac->aca_refcnt)) { @@ -212,6 +217,29 @@ static void aca_put(struct ifacaddr6 *ac) } } +static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, + const struct in6_addr *addr) +{ + struct inet6_dev *idev = rt->rt6i_idev; + struct ifacaddr6 *aca; + + aca = kzalloc(sizeof(*aca), GFP_ATOMIC); + if (aca == NULL) + return NULL; + + aca->aca_addr = *addr; + in6_dev_hold(idev); + aca->aca_idev = idev; + aca->aca_rt = rt; + aca->aca_users = 1; + /* aca_tstamp should be updated upon changes */ + aca->aca_cstamp = aca->aca_tstamp = jiffies; + atomic_set(&aca->aca_refcnt, 1); + spin_lock_init(&aca->aca_lock); + + return aca; +} + /* * device anycast group inc (add if not found) */ @@ -223,7 +251,6 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) ASSERT_RTNL(); - in6_dev_hold(idev); write_lock_bh(&idev->lock); if (idev->dead) { err = -ENODEV; @@ -238,35 +265,25 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } } - /* - * not found: create a new one. - */ - - aca = kzalloc(sizeof(struct ifacaddr6), GFP_ATOMIC); - - if (aca == NULL) { - err = -ENOMEM; - goto out; - } - rt = addrconf_dst_alloc(idev, addr, true); if (IS_ERR(rt)) { - kfree(aca); err = PTR_ERR(rt); goto out; } - - aca->aca_addr = *addr; - aca->aca_idev = idev; - aca->aca_rt = rt; - aca->aca_users = 1; - /* aca_tstamp should be updated upon changes */ - aca->aca_cstamp = aca->aca_tstamp = jiffies; - atomic_set(&aca->aca_refcnt, 2); - spin_lock_init(&aca->aca_lock); + aca = aca_alloc(rt, addr); + if (aca == NULL) { + ip6_rt_put(rt); + err = -ENOMEM; + goto out; + } aca->aca_next = idev->ac_list; idev->ac_list = aca; + + /* Hold this for addrconf_join_solict() below before we unlock, + * it is already exposed via idev->ac_list. + */ + aca_get(aca); write_unlock_bh(&idev->lock); ip6_ins_rt(rt); @@ -277,7 +294,6 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) return 0; out: write_unlock_bh(&idev->lock); - in6_dev_put(idev); return err; } -- cgit v1.2.3 From b5350916bfd49f737d73c4c512fbea1b3537c703 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:13 -0700 Subject: ipv6: drop ipv6_sk_mc_lock in mcast Similarly the code is already protected by rtnl lock. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 484a94215d88..27ca0b741ccc 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -73,9 +73,6 @@ static void *__mld2_query_bugs[] __attribute__((__unused__)) = { static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; -/* Big mc list lock for all the sockets */ -static DEFINE_SPINLOCK(ipv6_sk_mc_lock); - static void igmp6_join_group(struct ifmcaddr6 *ma); static void igmp6_leave_group(struct ifmcaddr6 *ma); static void igmp6_timer_handler(unsigned long data); @@ -201,10 +198,8 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return err; } - spin_lock(&ipv6_sk_mc_lock); mc_lst->next = np->ipv6_mc_list; rcu_assign_pointer(np->ipv6_mc_list, mc_lst); - spin_unlock(&ipv6_sk_mc_lock); rcu_read_unlock(); rtnl_unlock(); @@ -226,17 +221,14 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) return -EINVAL; rtnl_lock(); - spin_lock(&ipv6_sk_mc_lock); for (lnk = &np->ipv6_mc_list; - (mc_lst = rcu_dereference_protected(*lnk, - lockdep_is_held(&ipv6_sk_mc_lock))) != NULL; + (mc_lst = rtnl_dereference(*lnk)) != NULL; lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { struct net_device *dev; *lnk = mc_lst->next; - spin_unlock(&ipv6_sk_mc_lock); rcu_read_lock(); dev = dev_get_by_index_rcu(net, mc_lst->ifindex); @@ -256,7 +248,6 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) return 0; } } - spin_unlock(&ipv6_sk_mc_lock); rtnl_unlock(); return -EADDRNOTAVAIL; @@ -303,13 +294,10 @@ void ipv6_sock_mc_close(struct sock *sk) return; rtnl_lock(); - spin_lock(&ipv6_sk_mc_lock); - while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list, - lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) { + while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) { struct net_device *dev; np->ipv6_mc_list = mc_lst->next; - spin_unlock(&ipv6_sk_mc_lock); rcu_read_lock(); dev = dev_get_by_index_rcu(net, mc_lst->ifindex); @@ -326,9 +314,7 @@ void ipv6_sock_mc_close(struct sock *sk) atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); - spin_lock(&ipv6_sk_mc_lock); } - spin_unlock(&ipv6_sk_mc_lock); rtnl_unlock(); } -- cgit v1.2.3 From 414b6c943fe25c5c576b6f0ce9077c29a150d826 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:14 -0700 Subject: ipv6: drop some rcu_read_lock in mcast Similarly the code is already protected by rtnl lock. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 27ca0b741ccc..4fb761d4002d 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -162,7 +162,6 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) mc_lst->addr = *addr; rtnl_lock(); - rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(net, addr, NULL, 0, 0); @@ -171,10 +170,9 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) ip6_rt_put(rt); } } else - dev = dev_get_by_index_rcu(net, ifindex); + dev = __dev_get_by_index(net, ifindex); if (dev == NULL) { - rcu_read_unlock(); rtnl_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; @@ -192,7 +190,6 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) err = ipv6_dev_mc_inc(dev, addr); if (err) { - rcu_read_unlock(); rtnl_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; @@ -201,7 +198,6 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) mc_lst->next = np->ipv6_mc_list; rcu_assign_pointer(np->ipv6_mc_list, mc_lst); - rcu_read_unlock(); rtnl_unlock(); return 0; @@ -230,8 +226,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) *lnk = mc_lst->next; - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, mc_lst->ifindex); + dev = __dev_get_by_index(net, mc_lst->ifindex); if (dev != NULL) { struct inet6_dev *idev = __in6_dev_get(dev); @@ -240,7 +235,6 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) __ipv6_dev_mc_dec(idev, &mc_lst->addr); } else (void) ip6_mc_leave_src(sk, mc_lst, NULL); - rcu_read_unlock(); rtnl_unlock(); atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); @@ -299,8 +293,7 @@ void ipv6_sock_mc_close(struct sock *sk) np->ipv6_mc_list = mc_lst->next; - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, mc_lst->ifindex); + dev = __dev_get_by_index(net, mc_lst->ifindex); if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); @@ -309,7 +302,6 @@ void ipv6_sock_mc_close(struct sock *sk) __ipv6_dev_mc_dec(idev, &mc_lst->addr); } else (void) ip6_mc_leave_src(sk, mc_lst, NULL); - rcu_read_unlock(); atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); @@ -934,7 +926,7 @@ int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) struct inet6_dev *idev; int err; - rcu_read_lock(); + ASSERT_RTNL(); idev = __in6_dev_get(dev); if (!idev) @@ -942,7 +934,6 @@ int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) else err = __ipv6_dev_mc_dec(idev, addr); - rcu_read_unlock(); return err; } -- cgit v1.2.3 From f7ed925c1b4d62e82b72d8e99fa4be52fb0c73b6 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:15 -0700 Subject: ipv6: update the comment in mcast.c Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 4fb761d4002d..d64e26366cb8 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -556,9 +556,8 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, } err = -EADDRNOTAVAIL; - /* - * changes to the ipv6_mc_list require the socket lock and - * a read lock on ip6_sk_mc_lock. We have the socket lock, + /* changes to the ipv6_mc_list require the socket lock and + * rtnl lock. We have the socket lock and rcu read lock, * so reading the list is safe. */ @@ -582,9 +581,8 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { return -EFAULT; } - /* changes to psl require the socket lock, a read lock on - * on ipv6_sk_mc_lock and a write lock on pmc->sflock. We - * have the socket lock, so reading here is safe. + /* changes to psl require the socket lock, and a write lock + * on pmc->sflock. We have the socket lock so reading here is safe. */ for (i = 0; i < copycount; i++) { struct sockaddr_in6 *psin6; @@ -2350,7 +2348,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, { int err; - /* callers have the socket lock and a write lock on ipv6_sk_mc_lock, + /* callers have the socket lock and rtnl lock * so no other readers or writers of iml or its sflist */ if (!iml->sflist) { -- cgit v1.2.3 From 1691c63ea42d6f57ba769df401b9773664edb936 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:16 -0700 Subject: ipv6: refactor ipv6_dev_mc_inc() Refactor out allocation and initialization and make the refcount code more readable. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 82 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index d64e26366cb8..592eba61e78a 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -641,14 +641,6 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, return rv; } -static void ma_put(struct ifmcaddr6 *mc) -{ - if (atomic_dec_and_test(&mc->mca_refcnt)) { - in6_dev_put(mc->idev); - kfree(mc); - } -} - static void igmp6_group_added(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; @@ -814,6 +806,48 @@ static void mld_clear_delrec(struct inet6_dev *idev) read_unlock_bh(&idev->lock); } +static void mca_get(struct ifmcaddr6 *mc) +{ + atomic_inc(&mc->mca_refcnt); +} + +static void ma_put(struct ifmcaddr6 *mc) +{ + if (atomic_dec_and_test(&mc->mca_refcnt)) { + in6_dev_put(mc->idev); + kfree(mc); + } +} + +static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, + const struct in6_addr *addr) +{ + struct ifmcaddr6 *mc; + + mc = kzalloc(sizeof(*mc), GFP_ATOMIC); + if (mc == NULL) + return NULL; + + setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); + + mc->mca_addr = *addr; + mc->idev = idev; /* reference taken by caller */ + mc->mca_users = 1; + /* mca_stamp should be updated upon changes */ + mc->mca_cstamp = mc->mca_tstamp = jiffies; + atomic_set(&mc->mca_refcnt, 1); + spin_lock_init(&mc->mca_lock); + + /* initial mode is (EX, empty) */ + mc->mca_sfmode = MCAST_EXCLUDE; + mc->mca_sfcount[MCAST_EXCLUDE] = 1; + + if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || + IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) + mc->mca_flags |= MAF_NOREPORT; + + return mc; +} /* * device multicast group inc (add if not found) @@ -849,38 +883,20 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) } } - /* - * not found: create a new one. - */ - - mc = kzalloc(sizeof(struct ifmcaddr6), GFP_ATOMIC); - - if (mc == NULL) { + mc = mca_alloc(idev, addr); + if (!mc) { write_unlock_bh(&idev->lock); in6_dev_put(idev); return -ENOMEM; } - setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); - - mc->mca_addr = *addr; - mc->idev = idev; /* (reference taken) */ - mc->mca_users = 1; - /* mca_stamp should be updated upon changes */ - mc->mca_cstamp = mc->mca_tstamp = jiffies; - atomic_set(&mc->mca_refcnt, 2); - spin_lock_init(&mc->mca_lock); - - /* initial mode is (EX, empty) */ - mc->mca_sfmode = MCAST_EXCLUDE; - mc->mca_sfcount[MCAST_EXCLUDE] = 1; - - if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || - IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) - mc->mca_flags |= MAF_NOREPORT; - mc->next = idev->mc_list; idev->mc_list = mc; + + /* Hold this for the code below before we unlock, + * it is already exposed via idev->mc_list. + */ + mca_get(mc); write_unlock_bh(&idev->lock); mld_del_delrec(idev, &mc->mca_addr); -- cgit v1.2.3 From 3ce62a84d53cd3d3cc5377bbf339e9b08ddf9c36 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:07:16 -0700 Subject: ipv6: exit early in addrconf_notify() if IPv6 is disabled If IPv6 is explicitly disabled before the interface comes up, it makes no sense to continue when it comes up, even just print a message. (I am not sure about other cases though, so I prefer not to touch) Signed-off-by: Cong Wang Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6b6a373a30b2..39d33355d7e8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2844,6 +2844,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (dev->flags & IFF_SLAVE) break; + if (idev && idev->cnf.disable_ipv6) + break; + if (event == NETDEV_UP) { if (!addrconf_qdisc_ok(dev)) { /* device is not ready yet. */ -- cgit v1.2.3 From ac7a04c33dd7f8e429df4b929ba3a3e8e729cc89 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 11 Sep 2014 21:18:09 -0700 Subject: net: dsa: change tag_protocol to an enum Now that we introduced an additional multiplexing/demultiplexing layer with commit 3e8a72d1dae37 ("net: dsa: reduce number of protocol hooks") that lives within the DSA code, we no longer need to have a given switch driver tag_protocol be an actual ethertype value, instead, we can replace it with an enum: dsa_tag_protocol. Do this replacement in the drivers, which allows us to get rid of the cpu_to_be16()/htons() dance, and remove ETH_P_BRCMTAG since we do not need it anymore. Suggested-by: Alexander Duyck Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 2 +- drivers/net/dsa/mv88e6060.c | 2 +- drivers/net/dsa/mv88e6123_61_65.c | 4 ++-- drivers/net/dsa/mv88e6131.c | 2 +- include/net/dsa.h | 17 ++++++++++------- net/dsa/slave.c | 8 ++++---- net/dsa/tag_brcm.c | 1 - 7 files changed, 19 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index bb7cb8e283b1..e9918c7f1792 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -592,7 +592,7 @@ static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port, } static struct dsa_switch_driver bcm_sf2_switch_driver = { - .tag_protocol = htons(ETH_P_BRCMTAG), + .tag_protocol = DSA_TAG_PROTO_BRCM, .priv_size = sizeof(struct bcm_sf2_priv), .probe = bcm_sf2_sw_probe, .setup = bcm_sf2_sw_setup, diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index 7a54ec04b418..d8037c1055ac 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -258,7 +258,7 @@ static void mv88e6060_poll_link(struct dsa_switch *ds) } static struct dsa_switch_driver mv88e6060_switch_driver = { - .tag_protocol = htons(ETH_P_TRAILER), + .tag_protocol = DSA_TAG_PROTO_TRAILER, .probe = mv88e6060_probe, .setup = mv88e6060_setup, .set_addr = mv88e6060_set_addr, diff --git a/drivers/net/dsa/mv88e6123_61_65.c b/drivers/net/dsa/mv88e6123_61_65.c index 69c42513dd72..975774f338c2 100644 --- a/drivers/net/dsa/mv88e6123_61_65.c +++ b/drivers/net/dsa/mv88e6123_61_65.c @@ -207,7 +207,7 @@ static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p) */ val = 0x0433; if (dsa_is_cpu_port(ds, p)) { - if (ds->dst->tag_protocol == htons(ETH_P_EDSA)) + if (ds->dst->tag_protocol == DSA_TAG_PROTO_EDSA) val |= 0x3300; else val |= 0x0100; @@ -391,7 +391,7 @@ static int mv88e6123_61_65_get_sset_count(struct dsa_switch *ds) } struct dsa_switch_driver mv88e6123_61_65_switch_driver = { - .tag_protocol = cpu_to_be16(ETH_P_EDSA), + .tag_protocol = DSA_TAG_PROTO_EDSA, .priv_size = sizeof(struct mv88e6xxx_priv_state), .probe = mv88e6123_61_65_probe, .setup = mv88e6123_61_65_setup, diff --git a/drivers/net/dsa/mv88e6131.c b/drivers/net/dsa/mv88e6131.c index 953bc6a49e59..35541f2ceca3 100644 --- a/drivers/net/dsa/mv88e6131.c +++ b/drivers/net/dsa/mv88e6131.c @@ -379,7 +379,7 @@ static int mv88e6131_get_sset_count(struct dsa_switch *ds) } struct dsa_switch_driver mv88e6131_switch_driver = { - .tag_protocol = cpu_to_be16(ETH_P_DSA), + .tag_protocol = DSA_TAG_PROTO_DSA, .priv_size = sizeof(struct mv88e6xxx_priv_state), .probe = mv88e6131_probe, .setup = mv88e6131_setup, diff --git a/include/net/dsa.h b/include/net/dsa.h index 97712927a9d2..8a8a5d976f97 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -19,10 +19,13 @@ #include #include -/* Not an official ethertype value, used only internally for DSA - * demultiplexing - */ -#define ETH_P_BRCMTAG (ETH_P_XDSA + 1) +enum dsa_tag_protocol { + DSA_TAG_PROTO_NONE = 0, + DSA_TAG_PROTO_DSA, + DSA_TAG_PROTO_TRAILER, + DSA_TAG_PROTO_EDSA, + DSA_TAG_PROTO_BRCM, +}; #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 @@ -89,7 +92,7 @@ struct dsa_switch_tree { */ struct net_device *master_netdev; const struct dsa_device_ops *ops; - __be16 tag_protocol; + enum dsa_tag_protocol tag_protocol; /* * The switch and port to which the CPU is attached. @@ -166,7 +169,7 @@ static inline u8 dsa_upstream_port(struct dsa_switch *ds) struct dsa_switch_driver { struct list_head list; - __be16 tag_protocol; + enum dsa_tag_protocol tag_protocol; int priv_size; /* @@ -215,7 +218,7 @@ static inline void *ds_to_priv(struct dsa_switch *ds) static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) { - return dst->tag_protocol != 0; + return dst->tag_protocol != DSA_TAG_PROTO_NONE; } #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 7333a4aebb7d..809eeb13eb12 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -437,22 +437,22 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA - case htons(ETH_P_DSA): + case DSA_TAG_PROTO_DSA: ds->dst->ops = &dsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_EDSA - case htons(ETH_P_EDSA): + case DSA_TAG_PROTO_EDSA: ds->dst->ops = &edsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER - case htons(ETH_P_TRAILER): + case DSA_TAG_PROTO_TRAILER: ds->dst->ops = &trailer_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_BRCM - case htons(ETH_P_BRCMTAG): + case DSA_TAG_PROTO_BRCM: ds->dst->ops = &brcm_netdev_ops; break; #endif diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index e0b759ec209e..8fbc21c0de78 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -91,7 +91,6 @@ static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ - skb->protocol = htons(ETH_P_BRCMTAG); skb->dev = p->parent->dst->master_netdev; dev_queue_xmit(skb); -- cgit v1.2.3 From 233577a22089facf5271ab5e845b2262047c971f Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 12 Sep 2014 14:04:43 +0200 Subject: net: filter: constify detection of pkt_type_offset Currently we have 2 pkt_type_offset functions doing the same thing and spread across the architecture files. Remove those and replace them with a PKT_TYPE_OFFSET macro helper which gets the constant value from a zero sized sk_buff member right in front of the bitfield with offsetof. This new offset marker does not change size of struct sk_buff. Cc: Eric Dumazet Cc: Markos Chandras Cc: Martin Schwidefsky Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Denis Kirjanov Signed-off-by: Hannes Frederic Sowa Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- arch/mips/net/bpf_jit.c | 27 +-------------------------- arch/s390/net/bpf_jit_comp.c | 35 +---------------------------------- include/linux/skbuff.h | 10 ++++++++++ net/core/filter.c | 31 ++----------------------------- 4 files changed, 14 insertions(+), 89 deletions(-) (limited to 'net') diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 0e97ccd29fe3..7edc08398c4a 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -765,27 +765,6 @@ static u64 jit_get_skb_w(struct sk_buff *skb, unsigned offset) return (u64)err << 32 | ntohl(ret); } -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_TYPE_MAX (7 << 5) -#else -#define PKT_TYPE_MAX 7 -#endif -static int pkt_type_offset(void) -{ - struct sk_buff skb_probe = { - .pkt_type = ~0, - }; - u8 *ct = (u8 *)&skb_probe; - unsigned int off; - - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (ct[off] == PKT_TYPE_MAX) - return off; - } - pr_err_once("Please fix pkt_type_offset(), as pkt_type couldn't be found\n"); - return -1; -} - static int build_body(struct jit_ctx *ctx) { void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w}; @@ -1332,11 +1311,7 @@ jmp_cmp: case BPF_ANC | SKF_AD_PKTTYPE: ctx->flags |= SEEN_SKB; - off = pkt_type_offset(); - - if (off < 0) - return -1; - emit_load_byte(r_tmp, r_skb, off, ctx); + emit_load_byte(r_tmp, r_skb, PKT_TYPE_OFFSET(), ctx); /* Keep only the last 3 bits */ emit_andi(r_A, r_tmp, PKT_TYPE_MAX, ctx); #ifdef __BIG_ENDIAN_BITFIELD diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 555f5c7e83ab..c52ac77408ca 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -227,37 +227,6 @@ static void bpf_jit_epilogue(struct bpf_jit *jit) EMIT2(0x07fe); } -/* Helper to find the offset of pkt_type in sk_buff - * Make sure its still a 3bit field starting at the MSBs within a byte. - */ -#define PKT_TYPE_MAX 0xe0 -static int pkt_type_offset; - -static int __init bpf_pkt_type_offset_init(void) -{ - struct sk_buff skb_probe = { - .pkt_type = ~0, - }; - char *ct = (char *)&skb_probe; - int off; - - pkt_type_offset = -1; - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (!ct[off]) - continue; - if (ct[off] == PKT_TYPE_MAX) - pkt_type_offset = off; - else { - /* Found non matching bit pattern, fix needed. */ - WARN_ON_ONCE(1); - pkt_type_offset = -1; - return -1; - } - } - return 0; -} -device_initcall(bpf_pkt_type_offset_init); - /* * make sure we dont leak kernel information to user */ @@ -757,12 +726,10 @@ call_fn: /* lg %r1,(%r13) */ } break; case BPF_ANC | SKF_AD_PKTTYPE: - if (pkt_type_offset < 0) - goto out; /* lhi %r5,0 */ EMIT4(0xa7580000); /* ic %r5,(%r2) */ - EMIT4_DISP(0x43502000, pkt_type_offset); + EMIT4_DISP(0x43502000, PKT_TYPE_OFFSET()); /* srl %r5,5 */ EMIT4_DISP(0x88500000, 5); break; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 07c9fdd0c126..756e3d057e84 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -548,6 +548,16 @@ struct sk_buff { ip_summed:2, nohdr:1, nfctinfo:3; + +/* if you move pkt_type around you also must adapt those constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_TYPE_MAX (7 << 5) +#else +#define PKT_TYPE_MAX 7 +#endif +#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) + + __u8 __pkt_type_offset[0]; __u8 pkt_type:3, fclone:2, ipvs_property:1, diff --git a/net/core/filter.c b/net/core/filter.c index dfc716ffa44b..601f28de7311 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,30 +87,6 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sk_filter); -/* Helper to find the offset of pkt_type in sk_buff structure. We want - * to make sure its still a 3bit field starting at a byte boundary; - * taken from arch/x86/net/bpf_jit_comp.c. - */ -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_TYPE_MAX (7 << 5) -#else -#define PKT_TYPE_MAX 7 -#endif -static unsigned int pkt_type_offset(void) -{ - struct sk_buff skb_probe = { .pkt_type = ~0, }; - u8 *ct = (u8 *) &skb_probe; - unsigned int off; - - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (ct[off] == PKT_TYPE_MAX) - return off; - } - - pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__); - return -1; -} - static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { return skb_get_poff((struct sk_buff *)(unsigned long) ctx); @@ -190,11 +166,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, - pkt_type_offset()); - if (insn->off < 0) - return false; - insn++; + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, + PKT_TYPE_OFFSET()); *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD insn++; -- cgit v1.2.3 From c0d1379a19c3dde3c32be50164997d246241c1e4 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sat, 13 Sep 2014 00:06:30 -0400 Subject: net: bpf: correctly handle errors in sk_attach_filter() Commit "net: bpf: make eBPF interpreter images read-only" has changed bpf_prog to be vmalloc()ed but never handled some of the errors paths of the old code. On error within sk_attach_filter (which userspace can easily trigger), we'd kfree() the vmalloc()ed memory, and leak the internal bpf_work_struct. Signed-off-by: Sasha Levin Acked-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 601f28de7311..fcd3f6742a6a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1047,7 +1047,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) return -ENOMEM; if (copy_from_user(prog->insns, fprog->filter, fsize)) { - kfree(prog); + __bpf_prog_free(prog); return -EFAULT; } @@ -1055,7 +1055,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) err = bpf_prog_store_orig_filter(prog, fprog); if (err) { - kfree(prog); + __bpf_prog_free(prog); return -ENOMEM; } -- cgit v1.2.3 From 7cb9d20fd9f8fb41f29e294734c4f8b5dc81ed93 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 14 Sep 2014 22:50:46 +0200 Subject: Bluetooth: Add BUILD_BUG_ON check for SKB control buffer size The struct bt_skb_cb size needs to stay within the limits of skb->cb at all times and to ensure that add a BUILD_BUG_ON to check for it at compile time. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/af_bluetooth.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 4dca0299ed96..339c74ad4553 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -709,8 +709,11 @@ EXPORT_SYMBOL_GPL(bt_debugfs); static int __init bt_init(void) { + struct sk_buff *skb; int err; + BUILD_BUG_ON(sizeof(struct bt_skb_cb) > sizeof(skb->cb)); + BT_INFO("Core ver %s", VERSION); bt_debugfs = debugfs_create_dir("bluetooth", NULL); -- cgit v1.2.3 From 43e73e4e2ad05d9bf3b438cfbe1e71b57a85f26c Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 14 Sep 2014 23:06:28 +0200 Subject: Bluetooth: Provide HCI command opcode information to driver The Bluetooth core already does processing of the HCI command header and puts it together before sending it to the driver. It is not really efficient for the driver to look at the HCI command header again in case it has to make certain decisions about certain commands. To make this easier, just provide the opcode as part of the SKB control buffer information. The extra information about the opcode is optional and only provided for HCI commands. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 1 + net/bluetooth/hci_core.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 373000de610d..7e666d06b97f 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -284,6 +284,7 @@ struct hci_req_ctrl { struct bt_skb_cb { __u8 pkt_type; __u8 incoming; + __u16 opcode; __u16 expect; __u8 force_active; struct l2cap_chan *chan; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 067526d9680d..41948678f514 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4547,6 +4547,7 @@ static struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, BT_DBG("skb len %d", skb->len); bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; + bt_cb(skb)->opcode = opcode; return skb; } -- cgit v1.2.3 From e11ecddf5128011c936cc5360780190cbc901fdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Sep 2014 04:19:51 -0700 Subject: tcp: use TCP_SKB_CB(skb)->tcp_flags in input path Input path of TCP do not currently uses TCP_SKB_CB(skb)->tcp_flags, which is only used in output path. tcp_recvmsg(), looks at tcp_hdr(skb)->syn for every skb found in receive queue, and its unfortunate because this bit is located in a cache line right before the payload. We can simplify TCP by copying tcp flags into TCP_SKB_CB(skb)->tcp_flags. This patch does so, and avoids the cache line miss in tcp_recvmsg() Following patches will - allow a segment with FIN being coalesced in tcp_try_coalesce() - simplify tcp_collapse() by not copying the headers. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 18 ++++++++++-------- net/ipv4/tcp_input.c | 10 +++++----- net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 17 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 541f26a67ba2..070aeff1b131 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1510,9 +1510,9 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) offset--; - if (offset < skb->len || tcp_hdr(skb)->fin) { + if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } @@ -1585,7 +1585,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (offset + 1 != skb->len) continue; } - if (tcp_hdr(skb)->fin) { + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { sk_eat_skb(sk, skb, false); ++seq; break; @@ -1722,11 +1722,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, break; offset = *seq - TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) offset--; if (offset < skb->len) goto found_ok_skb; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", @@ -1959,7 +1959,7 @@ skip_copy: if (used + offset < skb->len) continue; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) { sk_eat_skb(sk, skb, copied_early); @@ -2160,8 +2160,10 @@ void tcp_close(struct sock *sk, long timeout) * reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - - tcp_hdr(skb)->fin; + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + len--; data_was_unread += len; __kfree_skb(skb); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f97003ad0af5..8f639a4face9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4093,7 +4093,7 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); } } @@ -4513,7 +4513,7 @@ restart: * - bloated or contains data before "start" or * overlaps to the next one. */ - if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && + if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; @@ -4532,7 +4532,8 @@ restart: /* Decided to skip this, advance start seq. */ start = TCP_SKB_CB(skb)->end_seq; } - if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) + if (end_of_skbs || + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; while (before(start, end)) { @@ -4579,8 +4580,7 @@ restart: skb = tcp_collapse_one(sk, skb, list); if (!skb || skb == tail || - tcp_hdr(skb)->syn || - tcp_hdr(skb)->fin) + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7881b96d2b72..006b045716d8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1638,6 +1638,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1835480336ac..de51a88bec6f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1415,6 +1415,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; -- cgit v1.2.3 From e93a0435f809d009919a743fb6e93076faac8aa7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Sep 2014 04:19:52 -0700 Subject: tcp: allow segment with FIN in tcp_try_coalesce() We can allow a segment with FIN to be aggregated, if we take care to add tcp flags, and if skb_try_coalesce() takes care of zero sized skbs. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 ++- net/ipv4/tcp_input.c | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c8259ac38745..29f7f0121491 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3936,7 +3936,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return false; if (len <= skb_tailroom(to)) { - BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); + if (len) + BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; return true; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8f639a4face9..228bf0c5ff19 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4143,9 +4143,6 @@ static bool tcp_try_coalesce(struct sock *sk, *fragstolen = false; - if (tcp_hdr(from)->fin) - return false; - /* Its possible this segment overlaps with prior segment in queue */ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; @@ -4158,6 +4155,7 @@ static bool tcp_try_coalesce(struct sock *sk, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; + TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; return true; } -- cgit v1.2.3 From b3d6cb92fd190d720a01075c4d20cdca896663fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Sep 2014 04:19:53 -0700 Subject: tcp: do not copy headers in tcp_collapse() tcp_collapse() wants to shrink skb so that the overhead is minimal. Now we store tcp flags into TCP_SKB_CB(skb)->tcp_flags, we no longer need to keep around full headers. Whole available space is dedicated to the payload. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 228bf0c5ff19..ea92f23ffaf1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4535,26 +4535,13 @@ restart: return; while (before(start, end)) { + int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; - unsigned int header = skb_headroom(skb); - int copy = SKB_MAX_ORDER(header, 0); - /* Too big header? This can happen with IPv6. */ - if (copy < 0) - return; - if (end - start < copy) - copy = end - start; - nskb = alloc_skb(copy + header, GFP_ATOMIC); + nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) return; - skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); - skb_set_network_header(nskb, (skb_network_header(skb) - - skb->head)); - skb_set_transport_header(nskb, (skb_transport_header(skb) - - skb->head)); - skb_reserve(nskb, header); - memcpy(nskb->head, skb->head, header); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_queue_before(list, skb, nskb); -- cgit v1.2.3 From 73e64e1813e9ea45885419d0fff1e628a6ab95d4 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 15 Sep 2014 20:48:26 +0200 Subject: netfilter: ipset: Fix static checker warning in ip_set_core.c Dan Carpenter reported the following static checker warning: net/netfilter/ipset/ip_set_core.c:1414 call_ad() error: 'nlh->nlmsg_len' from user is not capped properly The payload size is limited now by the max size of size_t. Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 5593e97426c4..4ca4e5ca6f57 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1397,7 +1397,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb); struct sk_buff *skb2; struct nlmsgerr *errmsg; - size_t payload = sizeof(*errmsg) + nlmsg_len(nlh); + size_t payload = min(SIZE_MAX, + sizeof(*errmsg) + nlmsg_len(nlh)); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; struct nlattr *cmdattr; -- cgit v1.2.3 From 0e9871e3f79fd17c691b50a9669220c54ff084a2 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Thu, 28 Aug 2014 10:11:27 +0400 Subject: netfilter: ipset: Add skbinfo extension kernel support in the ipset core. Skbinfo extension provides mapping of metainformation with lookup in the ipset tables. This patch defines the flags, the constants, the functions and the structures for the data type independent support of the extension. Note the firewall mark stores in the kernel structures as two 32bit values, but transfered through netlink as one 64bit value. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 56 ++++++++++++++++++++++++++++- include/uapi/linux/netfilter/ipset/ip_set.h | 12 +++++++ net/netfilter/ipset/ip_set_core.c | 27 +++++++++++++- 3 files changed, 93 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 96afc29184be..b97aac5142ed 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -57,6 +57,8 @@ enum ip_set_extension { IPSET_EXT_COUNTER = (1 << IPSET_EXT_BIT_COUNTER), IPSET_EXT_BIT_COMMENT = 2, IPSET_EXT_COMMENT = (1 << IPSET_EXT_BIT_COMMENT), + IPSET_EXT_BIT_SKBINFO = 3, + IPSET_EXT_SKBINFO = (1 << IPSET_EXT_BIT_SKBINFO), /* Mark set with an extension which needs to call destroy */ IPSET_EXT_BIT_DESTROY = 7, IPSET_EXT_DESTROY = (1 << IPSET_EXT_BIT_DESTROY), @@ -65,12 +67,14 @@ enum ip_set_extension { #define SET_WITH_TIMEOUT(s) ((s)->extensions & IPSET_EXT_TIMEOUT) #define SET_WITH_COUNTER(s) ((s)->extensions & IPSET_EXT_COUNTER) #define SET_WITH_COMMENT(s) ((s)->extensions & IPSET_EXT_COMMENT) +#define SET_WITH_SKBINFO(s) ((s)->extensions & IPSET_EXT_SKBINFO) #define SET_WITH_FORCEADD(s) ((s)->flags & IPSET_CREATE_FLAG_FORCEADD) /* Extension id, in size order */ enum ip_set_ext_id { IPSET_EXT_ID_COUNTER = 0, IPSET_EXT_ID_TIMEOUT, + IPSET_EXT_ID_SKBINFO, IPSET_EXT_ID_COMMENT, IPSET_EXT_ID_MAX, }; @@ -92,6 +96,10 @@ struct ip_set_ext { u64 packets; u64 bytes; u32 timeout; + u32 skbmark; + u32 skbmarkmask; + u32 skbprio; + u16 skbqueue; char *comment; }; @@ -104,6 +112,13 @@ struct ip_set_comment { char *str; }; +struct ip_set_skbinfo { + u32 skbmark; + u32 skbmarkmask; + u32 skbprio; + u16 skbqueue; +}; + struct ip_set; #define ext_timeout(e, s) \ @@ -112,7 +127,8 @@ struct ip_set; (struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER]) #define ext_comment(e, s) \ (struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT]) - +#define ext_skbinfo(e, s) \ +(struct ip_set_skbinfo *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_SKBINFO]) typedef int (*ipset_adtfn)(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -256,6 +272,8 @@ ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) cadt_flags |= IPSET_FLAG_WITH_COUNTERS; if (SET_WITH_COMMENT(set)) cadt_flags |= IPSET_FLAG_WITH_COMMENT; + if (SET_WITH_SKBINFO(set)) + cadt_flags |= IPSET_FLAG_WITH_SKBINFO; if (SET_WITH_FORCEADD(set)) cadt_flags |= IPSET_FLAG_WITH_FORCEADD; @@ -304,6 +322,39 @@ ip_set_update_counter(struct ip_set_counter *counter, } } +static inline void +ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + mext->skbmark = skbinfo->skbmark; + mext->skbmarkmask = skbinfo->skbmarkmask; + mext->skbprio = skbinfo->skbprio; + mext->skbqueue = skbinfo->skbqueue; +} +static inline bool +ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo) +{ + return nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask)) || + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio)) || + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue)); + +} + +static inline void +ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext) +{ + skbinfo->skbmark = ext->skbmark; + skbinfo->skbmarkmask = ext->skbmarkmask; + skbinfo->skbprio = ext->skbprio; + skbinfo->skbqueue = ext->skbqueue; +} + static inline bool ip_set_put_counter(struct sk_buff *skb, struct ip_set_counter *counter) { @@ -497,6 +548,9 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, if (SET_WITH_COMMENT(set) && ip_set_put_comment(skb, ext_comment(e, set))) return -EMSGSIZE; + if (SET_WITH_SKBINFO(set) && + ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 78c2f2e79920..ca03119111a2 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -115,6 +115,9 @@ enum { IPSET_ATTR_BYTES, IPSET_ATTR_PACKETS, IPSET_ATTR_COMMENT, + IPSET_ATTR_SKBMARK, + IPSET_ATTR_SKBPRIO, + IPSET_ATTR_SKBQUEUE, __IPSET_ATTR_ADT_MAX, }; #define IPSET_ATTR_ADT_MAX (__IPSET_ATTR_ADT_MAX - 1) @@ -147,6 +150,7 @@ enum ipset_errno { IPSET_ERR_COUNTER, IPSET_ERR_COMMENT, IPSET_ERR_INVALID_MARKMASK, + IPSET_ERR_SKBINFO, /* Type specific error codes */ IPSET_ERR_TYPE_SPECIFIC = 4352, @@ -170,6 +174,12 @@ enum ipset_cmd_flags { IPSET_FLAG_MATCH_COUNTERS = (1 << IPSET_FLAG_BIT_MATCH_COUNTERS), IPSET_FLAG_BIT_RETURN_NOMATCH = 7, IPSET_FLAG_RETURN_NOMATCH = (1 << IPSET_FLAG_BIT_RETURN_NOMATCH), + IPSET_FLAG_BIT_MAP_SKBMARK = 8, + IPSET_FLAG_MAP_SKBMARK = (1 << IPSET_FLAG_BIT_MAP_SKBMARK), + IPSET_FLAG_BIT_MAP_SKBPRIO = 9, + IPSET_FLAG_MAP_SKBPRIO = (1 << IPSET_FLAG_BIT_MAP_SKBPRIO), + IPSET_FLAG_BIT_MAP_SKBQUEUE = 10, + IPSET_FLAG_MAP_SKBQUEUE = (1 << IPSET_FLAG_BIT_MAP_SKBQUEUE), IPSET_FLAG_CMD_MAX = 15, }; @@ -187,6 +197,8 @@ enum ipset_cadt_flags { IPSET_FLAG_WITH_COMMENT = (1 << IPSET_FLAG_BIT_WITH_COMMENT), IPSET_FLAG_BIT_WITH_FORCEADD = 5, IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD), + IPSET_FLAG_BIT_WITH_SKBINFO = 6, + IPSET_FLAG_WITH_SKBINFO = (1 << IPSET_FLAG_BIT_WITH_SKBINFO), IPSET_FLAG_CADT_MAX = 15, }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 4ca4e5ca6f57..26c795e6b57f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -337,6 +337,12 @@ const struct ip_set_ext_type ip_set_extensions[] = { .len = sizeof(unsigned long), .align = __alignof__(unsigned long), }, + [IPSET_EXT_ID_SKBINFO] = { + .type = IPSET_EXT_SKBINFO, + .flag = IPSET_FLAG_WITH_SKBINFO, + .len = sizeof(struct ip_set_skbinfo), + .align = __alignof__(struct ip_set_skbinfo), + }, [IPSET_EXT_ID_COMMENT] = { .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, .flag = IPSET_FLAG_WITH_COMMENT, @@ -382,6 +388,7 @@ int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { + u64 fullmark; if (tb[IPSET_ATTR_TIMEOUT]) { if (!(set->extensions & IPSET_EXT_TIMEOUT)) return -IPSET_ERR_TIMEOUT; @@ -402,7 +409,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } - + if (tb[IPSET_ATTR_SKBMARK]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); + ext->skbmark = fullmark >> 32; + ext->skbmarkmask = fullmark & 0xffffffff; + } + if (tb[IPSET_ATTR_SKBPRIO]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + ext->skbprio = be32_to_cpu(nla_get_be32( + tb[IPSET_ATTR_SKBPRIO])); + } + if (tb[IPSET_ATTR_SKBQUEUE]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + ext->skbqueue = be16_to_cpu(nla_get_be16( + tb[IPSET_ATTR_SKBQUEUE])); + } return 0; } EXPORT_SYMBOL_GPL(ip_set_get_extensions); -- cgit v1.2.3 From 39d1ecf1ad0e19145e1f3a6cd838b7354ef71bf7 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Thu, 28 Aug 2014 10:11:28 +0400 Subject: netfilter: ipset: Add skbinfo extension kernel support for the bitmap set types. Add skbinfo extension kernel support for the bitmap set types. Inroduce the new revisions of bitmap_ip, bitmap_ipmac and bitmap_port set types. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_bitmap_gen.h | 4 ++++ net/netfilter/ipset/ip_set_bitmap_ip.c | 11 +++++++++-- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 11 +++++++++-- net/netfilter/ipset/ip_set_bitmap_port.c | 11 +++++++++-- 4 files changed, 31 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index f2c7d83dc23f..6f024a8a1534 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -128,6 +128,8 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, return 0; if (SET_WITH_COUNTER(set)) ip_set_update_counter(ext_counter(x, set), ext, mext, flags); + if (SET_WITH_SKBINFO(set)) + ip_set_get_skbinfo(ext_skbinfo(x, set), ext, mext, flags); return 1; } @@ -161,6 +163,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_init_counter(ext_counter(x, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(x, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(x, set), ext); return 0; } diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index dafdb39ef042..55b083ec587a 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -27,7 +27,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ -#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ +/* 2 Comment support added */ +#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -139,7 +140,10 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -357,6 +361,9 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index dbad505e79e3..86104744b00f 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -27,7 +27,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ -#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ +/* 2 Comment support added */ +#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -240,7 +241,10 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -394,6 +398,9 @@ static struct ip_set_type bitmap_ipmac_type = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index a4b65ae1986c..005dd36444c3 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -22,7 +22,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ -#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ +/* 2 Comment support added */ +#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -139,7 +140,10 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -291,6 +295,9 @@ static struct ip_set_type bitmap_port_type = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; -- cgit v1.2.3 From af331419d34e2fc0e2d0c629734f8d160f95a3ec Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Thu, 28 Aug 2014 10:11:29 +0400 Subject: netfilter: ipset: Add skbinfo extension kernel support for the hash set types. Add skbinfo extension kernel support for the hash set types. Inroduce the new revisions of all hash set types. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 5 +++++ net/netfilter/ipset/ip_set_hash_ip.c | 14 ++++++++++++-- net/netfilter/ipset/ip_set_hash_ipmark.c | 14 ++++++++++++-- net/netfilter/ipset/ip_set_hash_ipport.c | 14 ++++++++++++-- net/netfilter/ipset/ip_set_hash_ipportip.c | 14 ++++++++++++-- net/netfilter/ipset/ip_set_hash_ipportnet.c | 14 ++++++++++++-- net/netfilter/ipset/ip_set_hash_net.c | 16 +++++++++++++--- net/netfilter/ipset/ip_set_hash_netiface.c | 16 +++++++++++++--- net/netfilter/ipset/ip_set_hash_netnet.c | 16 +++++++++++++--- net/netfilter/ipset/ip_set_hash_netport.c | 16 +++++++++++++--- net/netfilter/ipset/ip_set_hash_netportnet.c | 16 +++++++++++++--- 11 files changed, 130 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 8a38890cbe5e..ac3a268287d9 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -720,6 +720,8 @@ reuse_slot: ip_set_init_counter(ext_counter(data, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(data, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(data, set), ext); out: rcu_read_unlock_bh(); @@ -797,6 +799,9 @@ mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, if (SET_WITH_COUNTER(set)) ip_set_update_counter(ext_counter(data, set), ext, mext, flags); + if (SET_WITH_SKBINFO(set)) + ip_set_get_skbinfo(ext_skbinfo(data, set), + ext, mext, flags); return mtype_do_data_match(data); } diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index e52739938533..76959d79e9d1 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -26,7 +26,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support */ /* 2 Comments support */ -#define IPSET_TYPE_REV_MAX 3 /* Forceadd support */ +/* 3 Forceadd support */ +#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -111,7 +112,10 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -247,6 +251,9 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -295,6 +302,9 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 4eff0a297254..7abf9788cfa8 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -25,7 +25,8 @@ #include #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 1 /* Forceadd support */ +/* 1 Forceadd support */ +#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Vytas Dauksa "); @@ -113,7 +114,10 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -244,6 +248,9 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -301,6 +308,9 @@ static struct ip_set_type hash_ipmark_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index f37a5ae8a5e0..dcbcceb9a52f 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -28,7 +28,8 @@ /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ -#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ +/* 4 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -122,7 +123,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -287,6 +291,9 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -370,6 +377,9 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 41ef00eda874..7ef93fc887a1 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -28,7 +28,8 @@ /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ -#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ +/* 4 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -124,7 +125,10 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -295,6 +299,9 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -382,6 +389,9 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7308d84f9277..b6012ad92781 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -30,7 +30,8 @@ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ -#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ +/* 6 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -179,7 +180,10 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -432,6 +436,9 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -541,6 +548,9 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 4c7d495783a3..6b3ac10ac2f1 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -27,7 +27,8 @@ /* 2 nomatch flag support added */ /* 3 Counters support added */ /* 4 Comments support added */ -#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ +/* 5 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -150,7 +151,10 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -318,7 +322,10 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -377,6 +384,9 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index db2606805b35..03cdb69ac9bf 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -28,7 +28,8 @@ /* 2 /0 support added */ /* 3 Counters support added */ /* 4 Comments support added */ -#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ +/* 5 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -281,7 +282,10 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -514,7 +518,10 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -590,6 +597,9 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 96b131366e7b..da00284b3571 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -24,7 +24,8 @@ #include #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ +/* 1 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith "); @@ -171,7 +172,10 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -394,7 +398,10 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -462,6 +469,9 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 1c645fbd09c7..c0ddb58d19dc 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -29,7 +29,8 @@ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ -#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ +/* 6 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -172,7 +173,10 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -389,7 +393,10 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -489,6 +496,9 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 2f0034347189..b8053d675fc3 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -26,7 +26,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 0 Comments support added */ -#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ +/* 1 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith "); @@ -189,7 +190,10 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -460,7 +464,10 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -569,6 +576,9 @@ static struct ip_set_type hash_netportnet_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; -- cgit v1.2.3 From cbee93d7b71bf9d73382e503a4f60848eec60ea8 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Thu, 28 Aug 2014 10:11:30 +0400 Subject: netfilter: ipset: Add skbinfo extension kernel support for the list set type. Add skbinfo extension kernel support for the list set type. Introduce the new revision of the list set type. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_list_set.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index f87adbad6076..f8f682806e36 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -17,7 +17,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support added */ -#define IPSET_TYPE_REV_MAX 2 /* Comments support added */ +/* 2 Comments support added */ +#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -73,6 +74,10 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, ip_set_update_counter(ext_counter(e, set), ext, &opt->ext, cmdflags); + if (SET_WITH_SKBINFO(set)) + ip_set_get_skbinfo(ext_skbinfo(e, set), + ext, &opt->ext, + cmdflags); return ret; } } @@ -197,6 +202,8 @@ list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, ip_set_init_counter(ext_counter(e, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(e, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(e, set), ext); return 0; } @@ -307,6 +314,8 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_init_counter(ext_counter(e, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(e, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(e, set), ext); /* Set is already added to the list */ ip_set_put_byindex(map->net, d->id); return 0; @@ -378,7 +387,10 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -667,6 +679,9 @@ static struct ip_set_type list_set_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; -- cgit v1.2.3 From 76cea4109ca89dea218fdc652d2e1535fd9b5fc7 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Tue, 2 Sep 2014 14:21:20 +0400 Subject: netfilter: ipset: Add skbinfo extension support to SET target. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- include/uapi/linux/netfilter/xt_set.h | 10 +++ net/netfilter/xt_set.c | 155 ++++++++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/netfilter/xt_set.h b/include/uapi/linux/netfilter/xt_set.h index 964d3d42f874..d6a1df1f2947 100644 --- a/include/uapi/linux/netfilter/xt_set.h +++ b/include/uapi/linux/netfilter/xt_set.h @@ -71,4 +71,14 @@ struct xt_set_info_match_v3 { __u32 flags; }; +/* Revision 3 target */ + +struct xt_set_info_target_v3 { + struct xt_set_info add_set; + struct xt_set_info del_set; + struct xt_set_info map_set; + __u32 flags; + __u32 timeout; +}; + #endif /*_XT_SET_H*/ diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index cb70f6ec5695..5732cd64acc0 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -366,6 +366,140 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) #define set_target_v2_checkentry set_target_v1_checkentry #define set_target_v2_destroy set_target_v1_destroy +/* Revision 3 target */ + +static unsigned int +set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, info->flags, info->timeout); + ADT_OPT(del_opt, par->family, info->del_set.dim, + info->del_set.flags, 0, UINT_MAX); + ADT_OPT(map_opt, par->family, info->map_set.dim, + info->map_set.flags, 0, UINT_MAX); + + int ret; + + /* Normalize to fit into jiffies */ + if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && + add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par, &add_opt); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par, &del_opt); + if (info->map_set.index != IPSET_INVALID_ID) { + map_opt.cmdflags |= info->flags & (IPSET_FLAG_MAP_SKBMARK | + IPSET_FLAG_MAP_SKBPRIO | + IPSET_FLAG_MAP_SKBQUEUE); + ret = match_set(info->map_set.index, skb, par, &map_opt, + info->map_set.flags & IPSET_INV_MATCH); + if (!ret) + return XT_CONTINUE; + if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBMARK) + skb->mark = (skb->mark & ~(map_opt.ext.skbmarkmask)) + ^ (map_opt.ext.skbmark); + if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBPRIO) + skb->priority = map_opt.ext.skbprio; + if ((map_opt.cmdflags & IPSET_FLAG_MAP_SKBQUEUE) && + skb->dev && + skb->dev->real_num_tx_queues > map_opt.ext.skbqueue) + skb_set_queue_mapping(skb, map_opt.ext.skbqueue); + } + return XT_CONTINUE; +} + + +static int +set_target_v3_checkentry(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + ip_set_id_t index; + + if (info->add_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, + info->add_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find add_set index %u as target\n", + info->add_set.index); + return -ENOENT; + } + } + + if (info->del_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, + info->del_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find del_set index %u as target\n", + info->del_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->add_set.index); + return -ENOENT; + } + } + + if (info->map_set.index != IPSET_INVALID_ID) { + if (strncmp(par->table, "mangle", 7)) { + pr_warn("--map-set only usable from mangle table\n"); + return -EINVAL; + } + if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | + (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && + !(par->hook_mask & (1 << NF_INET_FORWARD | + 1 << NF_INET_LOCAL_OUT | + 1 << NF_INET_POST_ROUTING))) { + pr_warn("mapping of prio or/and queue is allowed only" + "from OUTPUT/FORWARD/POSTROUTING chains\n"); + return -EINVAL; + } + index = ip_set_nfnl_get_byindex(par->net, + info->map_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find map_set index %u as target\n", + info->map_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->del_set.index); + return -ENOENT; + } + } + + if (info->add_set.dim > IPSET_DIM_MAX || + info->del_set.dim > IPSET_DIM_MAX || + info->map_set.dim > IPSET_DIM_MAX) { + pr_warn("Protocol error: SET target dimension " + "is over the limit!\n"); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_target_v3_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); +} + + static struct xt_match set_matches[] __read_mostly = { { .name = "set", @@ -493,6 +627,27 @@ static struct xt_target set_targets[] __read_mostly = { .destroy = set_target_v2_destroy, .me = THIS_MODULE }, + /* --map-set support */ + { + .name = "SET", + .revision = 3, + .family = NFPROTO_IPV4, + .target = set_target_v3, + .targetsize = sizeof(struct xt_set_info_target_v3), + .checkentry = set_target_v3_checkentry, + .destroy = set_target_v3_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 3, + .family = NFPROTO_IPV6, + .target = set_target_v3, + .targetsize = sizeof(struct xt_set_info_target_v3), + .checkentry = set_target_v3_checkentry, + .destroy = set_target_v3_destroy, + .me = THIS_MODULE + }, }; static int __init xt_set_init(void) -- cgit v1.2.3 From 07034aeae152de52c29f032ca995bf9dafbe24e2 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 15 Sep 2014 17:36:06 +0200 Subject: netfilter: ipset: hash:mac type added to ipset Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/Kconfig | 9 ++ net/netfilter/ipset/Makefile | 1 + net/netfilter/ipset/ip_set_hash_gen.h | 11 ++- net/netfilter/ipset/ip_set_hash_mac.c | 173 ++++++++++++++++++++++++++++++++++ 4 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 net/netfilter/ipset/ip_set_hash_mac.c (limited to 'net') diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index 2f7f5c32c6f9..234a8ec82076 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig @@ -99,6 +99,15 @@ config IP_SET_HASH_IPPORTNET To compile it as a module, choose M here. If unsure, say N. +config IP_SET_HASH_MAC + tristate "hash:mac set support" + depends on IP_SET + help + This option adds the hash:mac set type support, by which + one can store MAC (ethernet address) elements in a set. + + To compile it as a module, choose M here. If unsure, say N. + config IP_SET_HASH_NETPORTNET tristate "hash:net,port,net set support" depends on IP_SET diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile index 231f10196cb9..3dbd5e958489 100644 --- a/net/netfilter/ipset/Makefile +++ b/net/netfilter/ipset/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o +obj-$(CONFIG_IP_SET_HASH_MAC) += ip_set_hash_mac.o obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index ac3a268287d9..fee7c64e4dd1 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1054,8 +1054,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, struct HTYPE *h; struct htable *t; +#ifndef IP_SET_PROTO_UNDEF if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) return -IPSET_ERR_INVALID_FAMILY; +#endif #ifdef IP_SET_HASH_WITH_MARKMASK markmask = 0xffffffff; @@ -1137,25 +1139,32 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, rcu_assign_pointer(h->table, t); set->data = h; +#ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) { +#endif set->variant = &IPSET_TOKEN(HTYPE, 4_variant); set->dsize = ip_set_elem_len(set, tb, sizeof(struct IPSET_TOKEN(HTYPE, 4_elem))); +#ifndef IP_SET_PROTO_UNDEF } else { set->variant = &IPSET_TOKEN(HTYPE, 6_variant); set->dsize = ip_set_elem_len(set, tb, sizeof(struct IPSET_TOKEN(HTYPE, 6_elem))); } +#endif if (tb[IPSET_ATTR_TIMEOUT]) { set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); +#ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) +#endif IPSET_TOKEN(HTYPE, 4_gc_init)(set, IPSET_TOKEN(HTYPE, 4_gc)); +#ifndef IP_SET_PROTO_UNDEF else IPSET_TOKEN(HTYPE, 6_gc_init)(set, IPSET_TOKEN(HTYPE, 6_gc)); +#endif } - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", set->name, jhash_size(t->htable_bits), t->htable_bits, h->maxelem, set->data, t); diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c new file mode 100644 index 000000000000..65690b52a4d5 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -0,0 +1,173 @@ +/* Copyright (C) 2014 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:mac type */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define IPSET_TYPE_REV_MIN 0 +#define IPSET_TYPE_REV_MAX 0 + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:mac"); + +/* Type specific function prefix */ +#define HTYPE hash_mac + +/* Member elements */ +struct hash_mac4_elem { + /* Zero valued IP addresses cannot be stored */ + union { + unsigned char ether[ETH_ALEN]; + __be32 foo[2]; + }; +}; + +/* Common functions */ + +static inline bool +hash_mac4_data_equal(const struct hash_mac4_elem *e1, + const struct hash_mac4_elem *e2, + u32 *multi) +{ + return ether_addr_equal(e1->ether, e2->ether); +} + +static inline bool +hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) +{ + return nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether); +} + +static inline void +hash_mac4_data_next(struct hash_mac4_elem *next, + const struct hash_mac4_elem *e) +{ +} + +#define MTYPE hash_mac4 +#define PF 4 +#define HOST_MASK 32 +#define IP_SET_EMIT_CREATE +#define IP_SET_PROTO_UNDEF +#include "ip_set_hash_gen.h" + +/* Zero valued element is not supported */ +static const unsigned char invalid_ether[ETH_ALEN] = { 0 }; + +static int +hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + /* MAC can be src only */ + if (!(opt->flags & IPSET_DIM_ONE_SRC)) + return 0; + + if (skb_mac_header(skb) < skb->head || + (skb_mac_header(skb) + ETH_HLEN) > skb->data) + return -EINVAL; + + memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); + if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret; + + if (unlikely(!tb[IPSET_ATTR_ETHER] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); + if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) + return -IPSET_ERR_HASH_ELEM; + + return adtfn(set, &e, &ext, &ext, flags); +} + +static struct ip_set_type hash_mac_type __read_mostly = { + .name = "hash:mac", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_MAC, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_mac_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, + .len = ETH_ALEN }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_mac_init(void) +{ + return ip_set_type_register(&hash_mac_type); +} + +static void __exit +hash_mac_fini(void) +{ + ip_set_type_unregister(&hash_mac_type); +} + +module_init(hash_mac_init); +module_exit(hash_mac_fini); -- cgit v1.2.3 From 5075314e4e4b559cc37675ad8a721a89bccd6284 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 15 Sep 2014 13:00:19 -0400 Subject: dsa: Split ops up, and avoid assigning tag_protocol and receive separately This change addresses several issues. First, it was possible to set tag_protocol without setting the ops pointer. To correct that I have reordered things so that rcv is now populated before we set tag_protocol. Second, it didn't make much sense to keep setting the device ops each time a new slave was registered. So by moving the receive portion out into root switch initialization that issue should be addressed. Third, I wanted to avoid sending tags if the rcv pointer was not registered so I changed the tag check to verify if the rcv function pointer is set on the root tree. If it is then we start sending DSA tagged frames. Finally I split the device ops pointer in the structures into two spots. I placed the rcv function pointer in the root switch since this makes it easiest to access from there, and I placed the xmit function pointer in the slave for the same reason. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ------- include/net/dsa.h | 10 ++++++---- net/dsa/dsa.c | 32 ++++++++++++++++++++++++++++---- net/dsa/dsa_priv.h | 11 ++++++++++- net/dsa/slave.c | 37 +++++++++++++++---------------------- net/dsa/tag_brcm.c | 1 - net/dsa/tag_dsa.c | 1 - net/dsa/tag_edsa.c | 1 - net/dsa/tag_trailer.c | 1 - 9 files changed, 59 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f9e81d10a3b9..28d4378615e5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1928,13 +1928,6 @@ struct udp_offload { struct offload_callbacks callbacks; }; -struct dsa_device_ops { - netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); - int (*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); -}; - - /* often modified stats are per cpu, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64 rx_packets; diff --git a/include/net/dsa.h b/include/net/dsa.h index 8a8a5d976f97..a55c4e6a4f0f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -77,7 +77,7 @@ struct dsa_platform_data { struct dsa_chip_data *chip; }; -struct dsa_device_ops; +struct packet_type; struct dsa_switch_tree { /* @@ -91,7 +91,10 @@ struct dsa_switch_tree { * protocol to use. */ struct net_device *master_netdev; - const struct dsa_device_ops *ops; + int (*rcv)(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev); enum dsa_tag_protocol tag_protocol; /* @@ -218,7 +221,6 @@ static inline void *ds_to_priv(struct dsa_switch *ds) static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) { - return dst->tag_protocol != DSA_TAG_PROTO_NONE; + return dst->rcv != NULL; } - #endif diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 61f145c44555..1df0a7cf1e9e 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include @@ -154,9 +153,34 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, * tagging protocol to the preferred tagging format of this * switch. */ - if (ds->dst->cpu_switch == index) - ds->dst->tag_protocol = drv->tag_protocol; + if (dst->cpu_switch == index) { + switch (drv->tag_protocol) { +#ifdef CONFIG_NET_DSA_TAG_DSA + case DSA_TAG_PROTO_DSA: + dst->rcv = dsa_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + case DSA_TAG_PROTO_EDSA: + dst->rcv = edsa_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER + case DSA_TAG_PROTO_TRAILER: + dst->rcv = trailer_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM + case DSA_TAG_PROTO_BRCM: + dst->rcv = brcm_netdev_ops.rcv; + break; +#endif + default: + break; + } + dst->tag_protocol = drv->tag_protocol; + } /* * Do basic register setup. @@ -626,7 +650,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, return 0; } - return dst->ops->rcv(skb, dev, pt, orig_dev); + return dst->rcv(skb, dev, pt, orig_dev); } static struct packet_type dsa_pack_type __read_mostly = { diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 98afed4d92ba..f90899e8ab5a 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -12,7 +12,13 @@ #define __DSA_PRIV_H #include -#include +#include + +struct dsa_device_ops { + netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); + int (*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); +}; struct dsa_slave_priv { /* @@ -20,6 +26,8 @@ struct dsa_slave_priv { * switch port. */ struct net_device *dev; + netdev_tx_t (*xmit)(struct sk_buff *skb, + struct net_device *dev); /* * Which switch this port is a part of, and the port index @@ -43,6 +51,7 @@ struct dsa_slave_priv { extern char dsa_driver_version[]; /* slave.c */ +extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); struct net_device *dsa_slave_create(struct dsa_switch *ds, struct device *parent, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 809eeb13eb12..e38a331111c0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -9,7 +9,6 @@ */ #include -#include #include #include #include @@ -176,9 +175,8 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch_tree *dst = p->parent->dst; - return dst->ops->xmit(skb, dev); + return p->xmit(skb, dev); } static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb, @@ -325,11 +323,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_do_ioctl = dsa_slave_ioctl, }; -static const struct dsa_device_ops notag_netdev_ops = { - .xmit = dsa_slave_notag_xmit, - .rcv = NULL, -}; - static void dsa_slave_adjust_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -435,41 +428,41 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->tx_queue_len = 0; slave_dev->netdev_ops = &dsa_slave_netdev_ops; + SET_NETDEV_DEV(slave_dev, parent); + slave_dev->dev.of_node = ds->pd->port_dn[port]; + slave_dev->vlan_features = master->vlan_features; + + p = netdev_priv(slave_dev); + p->dev = slave_dev; + p->parent = ds; + p->port = port; + switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA case DSA_TAG_PROTO_DSA: - ds->dst->ops = &dsa_netdev_ops; + p->xmit = dsa_netdev_ops.xmit; break; #endif #ifdef CONFIG_NET_DSA_TAG_EDSA case DSA_TAG_PROTO_EDSA: - ds->dst->ops = &edsa_netdev_ops; + p->xmit = edsa_netdev_ops.xmit; break; #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER case DSA_TAG_PROTO_TRAILER: - ds->dst->ops = &trailer_netdev_ops; + p->xmit = trailer_netdev_ops.xmit; break; #endif #ifdef CONFIG_NET_DSA_TAG_BRCM case DSA_TAG_PROTO_BRCM: - ds->dst->ops = &brcm_netdev_ops; + p->xmit = brcm_netdev_ops.xmit; break; #endif default: - ds->dst->ops = ¬ag_netdev_ops; + p->xmit = dsa_slave_notag_xmit; break; } - SET_NETDEV_DEV(slave_dev, parent); - slave_dev->dev.of_node = ds->pd->port_dn[port]; - slave_dev->vlan_features = master->vlan_features; - - p = netdev_priv(slave_dev); - p->dev = slave_dev; - p->parent = ds; - p->port = port; - p->old_pause = -1; p->old_link = -1; p->old_duplex = -1; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 8fbc21c0de78..83d3572cdb20 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -11,7 +11,6 @@ #include #include -#include #include #include "dsa_priv.h" diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index d7dbc5bda5c0..ce90c8bdc658 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -10,7 +10,6 @@ #include #include -#include #include #include "dsa_priv.h" diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 6b30abe89183..94fcce778679 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -10,7 +10,6 @@ #include #include -#include #include #include "dsa_priv.h" diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 5fe9444842c5..115fdca34077 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -10,7 +10,6 @@ #include #include -#include #include #include "dsa_priv.h" -- cgit v1.2.3 From b4d2394d01bc642e95b2cba956d908423c1bef77 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 15 Sep 2014 13:00:27 -0400 Subject: dsa: Replace mii_bus with a generic host device This change makes it so that instead of passing and storing a mii_bus we instead pass and store a host_dev. From there we can test to determine the exact type of device, and can verify it is the correct device for our switch. So for example it would be possible to pass a device pointer from a pci_dev and instead of checking for a PHY ID we could check for a vendor and/or device ID. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- arch/arm/plat-orion/common.c | 2 +- drivers/net/dsa/bcm_sf2.c | 2 +- drivers/net/dsa/mv88e6060.c | 13 +++++++++---- drivers/net/dsa/mv88e6123_61_65.c | 6 +++++- drivers/net/dsa/mv88e6131.c | 6 +++++- drivers/net/dsa/mv88e6171.c | 6 +++++- drivers/net/dsa/mv88e6xxx.c | 4 ++-- include/net/dsa.h | 9 +++++---- net/dsa/dsa.c | 24 ++++++++---------------- net/dsa/slave.c | 2 +- 10 files changed, 42 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/arch/arm/plat-orion/common.c b/arch/arm/plat-orion/common.c index 3ec6e8e8d368..f5b00f41c4f6 100644 --- a/arch/arm/plat-orion/common.c +++ b/arch/arm/plat-orion/common.c @@ -499,7 +499,7 @@ void __init orion_ge00_switch_init(struct dsa_platform_data *d, int irq) d->netdev = &orion_ge00.dev; for (i = 0; i < d->nr_chips; i++) - d->chip[i].mii_bus = &orion_ge00_shared.dev; + d->chip[i].host_dev = &orion_ge00_shared.dev; orion_switch_device.dev.platform_data = d; platform_device_register(&orion_switch_device); diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index e9918c7f1792..02d7db320d90 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -129,7 +129,7 @@ static int bcm_sf2_sw_get_sset_count(struct dsa_switch *ds) return BCM_SF2_STATS_SIZE; } -static char *bcm_sf2_sw_probe(struct mii_bus *bus, int sw_addr) +static char *bcm_sf2_sw_probe(struct device *host_dev, int sw_addr) { return "Broadcom Starfighter 2"; } diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index d8037c1055ac..776e965dc9f4 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -21,7 +21,8 @@ static int reg_read(struct dsa_switch *ds, int addr, int reg) { - return mdiobus_read(ds->master_mii_bus, ds->pd->sw_addr + addr, reg); + return mdiobus_read(to_mii_bus(ds->master_dev), + ds->pd->sw_addr + addr, reg); } #define REG_READ(addr, reg) \ @@ -37,8 +38,8 @@ static int reg_read(struct dsa_switch *ds, int addr, int reg) static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val) { - return mdiobus_write(ds->master_mii_bus, ds->pd->sw_addr + addr, - reg, val); + return mdiobus_write(to_mii_bus(ds->master_dev), + ds->pd->sw_addr + addr, reg, val); } #define REG_WRITE(addr, reg, val) \ @@ -50,10 +51,14 @@ static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val) return __ret; \ }) -static char *mv88e6060_probe(struct mii_bus *bus, int sw_addr) +static char *mv88e6060_probe(struct device *host_dev, int sw_addr) { + struct mii_bus *bus = dsa_host_dev_to_mii_bus(host_dev); int ret; + if (bus == NULL) + return NULL; + ret = mdiobus_read(bus, sw_addr + REG_PORT(0), 0x03); if (ret >= 0) { ret &= 0xfff0; diff --git a/drivers/net/dsa/mv88e6123_61_65.c b/drivers/net/dsa/mv88e6123_61_65.c index 975774f338c2..a332c53ff955 100644 --- a/drivers/net/dsa/mv88e6123_61_65.c +++ b/drivers/net/dsa/mv88e6123_61_65.c @@ -17,10 +17,14 @@ #include #include "mv88e6xxx.h" -static char *mv88e6123_61_65_probe(struct mii_bus *bus, int sw_addr) +static char *mv88e6123_61_65_probe(struct device *host_dev, int sw_addr) { + struct mii_bus *bus = dsa_host_dev_to_mii_bus(host_dev); int ret; + if (bus == NULL) + return NULL; + ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); if (ret >= 0) { if (ret == 0x1212) diff --git a/drivers/net/dsa/mv88e6131.c b/drivers/net/dsa/mv88e6131.c index 35541f2ceca3..244c735014fa 100644 --- a/drivers/net/dsa/mv88e6131.c +++ b/drivers/net/dsa/mv88e6131.c @@ -22,10 +22,14 @@ #define ID_6095 0x0950 #define ID_6131 0x1060 -static char *mv88e6131_probe(struct mii_bus *bus, int sw_addr) +static char *mv88e6131_probe(struct device *host_dev, int sw_addr) { + struct mii_bus *bus = dsa_host_dev_to_mii_bus(host_dev); int ret; + if (bus == NULL) + return NULL; + ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); if (ret >= 0) { ret &= 0xfff0; diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c index 03a70069a8c6..6365e30138af 100644 --- a/drivers/net/dsa/mv88e6171.c +++ b/drivers/net/dsa/mv88e6171.c @@ -17,10 +17,14 @@ #include #include "mv88e6xxx.h" -static char *mv88e6171_probe(struct mii_bus *bus, int sw_addr) +static char *mv88e6171_probe(struct device *host_dev, int sw_addr) { + struct mii_bus *bus = dsa_host_dev_to_mii_bus(host_dev); int ret; + if (bus == NULL) + return NULL; + ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); if (ret >= 0) { if ((ret & 0xfff0) == 0x1710) diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index 901d2a9704ef..d6f6428b27dc 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -78,7 +78,7 @@ int mv88e6xxx_reg_read(struct dsa_switch *ds, int addr, int reg) int ret; mutex_lock(&ps->smi_mutex); - ret = __mv88e6xxx_reg_read(ds->master_mii_bus, + ret = __mv88e6xxx_reg_read(to_mii_bus(ds->master_dev), ds->pd->sw_addr, addr, reg); mutex_unlock(&ps->smi_mutex); @@ -122,7 +122,7 @@ int mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg, u16 val) int ret; mutex_lock(&ps->smi_mutex); - ret = __mv88e6xxx_reg_write(ds->master_mii_bus, + ret = __mv88e6xxx_reg_write(to_mii_bus(ds->master_dev), ds->pd->sw_addr, addr, reg, val); mutex_unlock(&ps->smi_mutex); diff --git a/include/net/dsa.h b/include/net/dsa.h index a55c4e6a4f0f..c779e9bba1b3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -34,7 +34,7 @@ struct dsa_chip_data { /* * How to access the switch configuration registers. */ - struct device *mii_bus; + struct device *host_dev; int sw_addr; /* Device tree node pointer for this specific switch chip @@ -134,9 +134,9 @@ struct dsa_switch { struct dsa_switch_driver *drv; /* - * Reference to mii bus to use. + * Reference to host device to use. */ - struct mii_bus *master_mii_bus; + struct device *master_dev; /* * Slave mii_bus and devices for the individual ports. @@ -178,7 +178,7 @@ struct dsa_switch_driver { /* * Probing and setup. */ - char *(*probe)(struct mii_bus *bus, int sw_addr); + char *(*probe)(struct device *host_dev, int sw_addr); int (*setup)(struct dsa_switch *ds); int (*set_addr)(struct dsa_switch *ds, u8 *addr); @@ -213,6 +213,7 @@ struct dsa_switch_driver { void register_switch_driver(struct dsa_switch_driver *type); void unregister_switch_driver(struct dsa_switch_driver *type); +struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev); static inline void *ds_to_priv(struct dsa_switch *ds) { diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 1df0a7cf1e9e..b34d6978d773 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -43,7 +43,7 @@ void unregister_switch_driver(struct dsa_switch_driver *drv) EXPORT_SYMBOL_GPL(unregister_switch_driver); static struct dsa_switch_driver * -dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) +dsa_switch_probe(struct device *host_dev, int sw_addr, char **_name) { struct dsa_switch_driver *ret; struct list_head *list; @@ -58,7 +58,7 @@ dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) drv = list_entry(list, struct dsa_switch_driver, list); - name = drv->probe(bus, sw_addr); + name = drv->probe(host_dev, sw_addr); if (name != NULL) { ret = drv; break; @@ -75,7 +75,7 @@ dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) /* basic switch operations **************************************************/ static struct dsa_switch * dsa_switch_setup(struct dsa_switch_tree *dst, int index, - struct device *parent, struct mii_bus *bus) + struct device *parent, struct device *host_dev) { struct dsa_chip_data *pd = dst->pd->chip + index; struct dsa_switch_driver *drv; @@ -88,7 +88,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, /* * Probe for switch model. */ - drv = dsa_switch_probe(bus, pd->sw_addr, &name); + drv = dsa_switch_probe(host_dev, pd->sw_addr, &name); if (drv == NULL) { printk(KERN_ERR "%s[%d]: could not detect attached switch\n", dst->master_netdev->name, index); @@ -109,8 +109,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, ds->index = index; ds->pd = dst->pd->chip + index; ds->drv = drv; - ds->master_mii_bus = bus; - + ds->master_dev = host_dev; /* * Validate supplied switch configuration. @@ -285,7 +284,7 @@ static struct device *dev_find_class(struct device *parent, char *class) return device_find_child(parent, class, dev_is_class); } -static struct mii_bus *dev_to_mii_bus(struct device *dev) +struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev) { struct device *d; @@ -301,6 +300,7 @@ static struct mii_bus *dev_to_mii_bus(struct device *dev) return NULL; } +EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus); static struct net_device *dev_to_net_device(struct device *dev) { @@ -566,17 +566,9 @@ static int dsa_probe(struct platform_device *pdev) dst->cpu_port = -1; for (i = 0; i < pd->nr_chips; i++) { - struct mii_bus *bus; struct dsa_switch *ds; - bus = dev_to_mii_bus(pd->chip[i].mii_bus); - if (bus == NULL) { - printk(KERN_ERR "%s[%d]: no mii bus found for " - "dsa switch\n", dev->name, i); - continue; - } - - ds = dsa_switch_setup(dst, i, &pdev->dev, bus); + ds = dsa_switch_setup(dst, i, &pdev->dev, pd->chip[i].host_dev); if (IS_ERR(ds)) { printk(KERN_ERR "%s[%d]: couldn't create dsa switch " "instance (error %ld)\n", dev->name, i, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e38a331111c0..90c9689ed362 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -44,7 +44,7 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->write = dsa_slave_phy_write; snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x", ds->index, ds->pd->sw_addr); - ds->slave_mii_bus->parent = &ds->master_mii_bus->dev; + ds->slave_mii_bus->parent = ds->master_dev; } -- cgit v1.2.3 From 80dcbd12fb30932bb001b913ffe36843e2a1fe3a Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 15 Sep 2014 14:21:50 -0700 Subject: net_sched: fix suspicious RCU usage in cls_bpf_classify() Fixes: commit 1f947bf151e90ec0baad2948 ("net: sched: rcu'ify cls_bpf") Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6a7386e6e5a8..4e3f5bfc0b26 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -52,7 +52,7 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct cls_bpf_head *head = rcu_dereference(tp->root); + struct cls_bpf_head *head = rcu_dereference_bh(tp->root); struct cls_bpf_prog *prog; int ret; -- cgit v1.2.3 From a57a65ba47b71e7af67af30466c7e0bd2ec5786d Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 15 Sep 2014 14:06:46 -0700 Subject: net_sched: fix an allocation bug in tcindex_set_parms() Fixes: commit 331b72922c5f58d48fd ("net: sched: RCU cls_tcindex") Cc: John Fastabend Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index a9f4279fbd69..a02ca7298385 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -241,7 +241,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, * allocate new tcindex data and RCU assign it onto root. Keeping * perfect hash and hash pointers from old data. */ - cp = kzalloc(sizeof(cp), GFP_KERNEL); + cp = kzalloc(sizeof(*cp), GFP_KERNEL); if (!cp) return -ENOMEM; -- cgit v1.2.3 From 2f9a220eff18d31cf82b92e74d5bbc8a3d7043d8 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 15 Sep 2014 14:06:48 -0700 Subject: net_sched: fix suspicious RCU usage in tcindex_classify() This patch fixes the following kernel warning: [ 44.805900] [ INFO: suspicious RCU usage. ] [ 44.808946] 3.17.0-rc4+ #610 Not tainted [ 44.811831] ------------------------------- [ 44.814873] net/sched/cls_tcindex.c:84 suspicious rcu_dereference_check() usage! Fixes: commit 331b72922c5f58d48fd ("net: sched: RCU cls_tcindex") Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index a02ca7298385..b93974c4570c 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -81,7 +81,7 @@ tcindex_lookup(struct tcindex_data *p, u16 key) static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct tcindex_data *p = rcu_dereference(tp->root); + struct tcindex_data *p = rcu_dereference_bh(tp->root); struct tcindex_filter_result *f; int key = (skb->tc_index & p->mask) >> p->shift; -- cgit v1.2.3 From 10ee1c34bedcc0e2a196d85ec87806fd111b5e79 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 15 Sep 2014 14:06:49 -0700 Subject: net_sched: use tcindex_filter_result_init() Fixes: commit 331b72922c5f58d48fd ("net: sched: RCU cls_tcindex") Cc: John Fastabend Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index b93974c4570c..dd2d691f0bbb 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -260,10 +260,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } cp->h = p->h; - memset(&new_filter_result, 0, sizeof(new_filter_result)); - tcf_exts_init(&new_filter_result.exts, - TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - + tcindex_filter_result_init(&new_filter_result); tcindex_filter_result_init(&cr); if (old_r) cr.res = r->res; -- cgit v1.2.3 From c1f570a6abc192f047550743f9957b617af605af Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 15 Sep 2014 14:48:08 -0700 Subject: net: dsa: fix mii_bus to host_dev replacement dsa_of_probe() still used cd->mii_bus instead of cd->host_dev when building with CONFIG_OF=y. Fix this by making the replacement here as well. Fixes: b4d2394d01b ("dsa: Replace mii_bus with a generic host device") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index b34d6978d773..6e40928ec0e7 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -440,7 +440,7 @@ static int dsa_of_probe(struct platform_device *pdev) cd = &pd->chip[chip_index]; cd->of_node = child; - cd->mii_bus = &mdio_bus->dev; + cd->host_dev = &mdio_bus->dev; sw_addr = of_get_property(child, "reg", NULL); if (!sw_addr) -- cgit v1.2.3 From 616a9be25cb9516e546c0de55d61e1e46e54ade9 Mon Sep 17 00:00:00 2001 From: Kenny Mathis Date: Tue, 9 Sep 2014 09:20:15 -0400 Subject: ipvs: Add simple weighted failover scheduler Add simple weighted IPVS failover support to the Linux kernel. All other scheduling modules implement some form of load balancing, while this offers a simple failover solution. Connections are directed to the appropriate server based solely on highest weight value and server availability. Tested functionality with keepalived. Signed-off-by: Kenny Mathis Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/Kconfig | 10 ++++++ net/netfilter/ipvs/Makefile | 1 + net/netfilter/ipvs/ip_vs_fo.c | 79 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 net/netfilter/ipvs/ip_vs_fo.c (limited to 'net') diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 0c3b1670b0d1..3b6929dec748 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -152,6 +152,16 @@ config IP_VS_WLC If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_FO + tristate "weighted failover scheduling" + ---help--- + The weighted failover scheduling algorithm directs network + connections to the server with the highest weight that is + currently available. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + config IP_VS_LBLC tristate "locality-based least-connection scheduling" ---help--- diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index 34ee602ddb66..38b2723b2e3d 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c new file mode 100644 index 000000000000..6a2647d471ff --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -0,0 +1,79 @@ +/* + * IPVS: Weighted Fail Over module + * + * Authors: Kenny Mathis + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Kenny Mathis : added initial functionality based on weight + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + +/* Weighted Fail Over Module */ +static struct ip_vs_dest * +ip_vs_fo_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *hweight = NULL; + int hw = 0; /* Track highest weight */ + + IP_VS_DBG(6, "ip_vs_fo_schedule(): Scheduling...\n"); + + /* Basic failover functionality + * Find virtual server with highest weight and send it traffic + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > hw) { + hweight = dest; + hw = atomic_read(&dest->weight); + } + } + + if (hweight) { + IP_VS_DBG_BUF(6, "FO: server %s:%u activeconns %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &hweight->addr), + ntohs(hweight->port), + atomic_read(&hweight->activeconns), + atomic_read(&hweight->weight)); + return hweight; + } + + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; +} + +static struct ip_vs_scheduler ip_vs_fo_scheduler = { + .name = "fo", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_fo_scheduler.n_list), + .schedule = ip_vs_fo_schedule, +}; + +static int __init ip_vs_fo_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_fo_scheduler); +} + +static void __exit ip_vs_fo_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_fo_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_fo_init); +module_exit(ip_vs_fo_cleanup); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 6cff339bbd5f9eda7a5e8a521f91a88d046e6d0c Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:20 -0700 Subject: ipvs: Add destination address family to netlink interface This is necessary to support heterogeneous pools. For example, if you have an ipv6 addressed network, you'll want to be able to forward ipv4 traffic into it. This patch enforces that destination address family is the same as service family, as none of the forwarding mechanisms support anything else. For the old setsockopt mechanism, we simply set the dest address family to AF_INET as we do with the service. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 +++ include/uapi/linux/ip_vs.h | 3 +++ net/netfilter/ipvs/ip_vs_ctl.c | 45 +++++++++++++++++++++++++++++++++++------- 3 files changed, 44 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 624a8a54806d..b7e2b624d58e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -648,6 +648,9 @@ struct ip_vs_dest_user_kern { /* thresholds for active connections */ u32 u_threshold; /* upper threshold */ u32 l_threshold; /* lower threshold */ + + /* Address family of addr */ + u16 af; }; diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index fbcffe8041f7..cabe95d5b461 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -384,6 +384,9 @@ enum { IPVS_DEST_ATTR_PERSIST_CONNS, /* persistent connections */ IPVS_DEST_ATTR_STATS, /* nested attribute for dest stats */ + + IPVS_DEST_ATTR_ADDR_FAMILY, /* Address family of address */ + __IPVS_DEST_ATTR_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index bd2b208ba56c..594cec7ebc43 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -816,6 +816,8 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; + dest->af = udest->af; + spin_lock_bh(&dest->dst_lock); __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); @@ -846,8 +848,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, EnterFunction(2); + /* Temporary for consistency */ + if (udest->af != svc->af) + return -EINVAL; + #ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { + if (udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && @@ -875,12 +881,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, u64_stats_init(&ip_vs_dest_stats->syncp); } - dest->af = svc->af; + dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; dest->vport = svc->port; dest->vfwmark = svc->fwmark; - ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); + ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); dest->port = udest->port; atomic_set(&dest->activeconns, 0); @@ -928,7 +934,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); @@ -949,7 +955,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) if (dest != NULL) { IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " "dest->refcnt=%d, service %u/%s:%u\n", - IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), atomic_read(&dest->refcnt), dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->vaddr), @@ -992,7 +998,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); @@ -2244,6 +2250,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, udest->weight = udest_compat->weight; udest->u_threshold = udest_compat->u_threshold; udest->l_threshold = udest_compat->l_threshold; + udest->af = AF_INET; } static int @@ -2480,6 +2487,12 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, if (count >= get->num_dests) break; + /* Cannot expose heterogeneous members via sockopt + * interface + */ + if (dest->af != svc->af) + continue; + entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); @@ -2777,6 +2790,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, + [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3032,7 +3046,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, atomic_read(&dest->inactconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns))) + atomic_read(&dest->persistconns)) || + nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) goto nla_put_failure; @@ -3113,6 +3128,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, { struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; struct nlattr *nla_addr, *nla_port; + struct nlattr *nla_addr_family; /* Parse mandatory identifying destination fields first */ if (nla == NULL || @@ -3121,6 +3137,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; nla_port = attrs[IPVS_DEST_ATTR_PORT]; + nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; if (!(nla_addr && nla_port)) return -EINVAL; @@ -3130,6 +3147,11 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); + if (nla_addr_family) + udest->af = nla_get_u16(nla_addr_family); + else + udest->af = 0; + /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, @@ -3357,6 +3379,15 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) need_full_dest); if (ret) goto out; + + /* Old protocols did not allow the user to specify address + * family, so we set it to zero instead. We also didn't + * allow heterogeneous pools in the old code, so it's safe + * to assume that this will have the same address family as + * the service. + */ + if (udest.af == 0) + udest.af = svc->af; } switch (cmd) { -- cgit v1.2.3 From 655eef103d0bd99f540a52f7ede032e120756846 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:21 -0700 Subject: ipvs: Supply destination addr family to ip_vs_{lookup_dest,find_dest} We need to remove the assumption that virtual address family is the same as real address family in order to support heterogeneous services (that is, services with v4 vips and v6 backends or the opposite). Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 5 +++-- net/netfilter/ipvs/ip_vs_conn.c | 8 +++++++- net/netfilter/ipvs/ip_vs_ctl.c | 24 ++++++++++++------------ net/netfilter/ipvs/ip_vs_sync.c | 10 ++++++++-- 4 files changed, 30 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index b7e2b624d58e..2fa1155b24f7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1399,8 +1399,9 @@ void ip_vs_unregister_nl_ioctl(void); int ip_vs_control_init(void); void ip_vs_control_cleanup(void); struct ip_vs_dest * -ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, - __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, +ip_vs_find_dest(struct net *net, int svc_af, int dest_af, + const union nf_inet_addr *daddr, __be16 dport, + const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, __u32 fwmark, __u32 flags); void ip_vs_try_bind_dest(struct ip_vs_conn *cp); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 610e19c0e13f..8f4c602bb34b 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -616,7 +616,13 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp) struct ip_vs_dest *dest; rcu_read_lock(); - dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + + /* This function is only invoked by the synchronization code. We do + * not currently support heterogeneous pools with synchronization, + * so we can make the assumption that the svc_af is the same as the + * dest_af + */ + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, cp->protocol, cp->fwmark, cp->flags); if (dest) { diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 594cec7ebc43..c840d895bd1a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -574,8 +574,8 @@ bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, * Called under RCU lock. */ static struct ip_vs_dest * -ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) +ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, + const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; @@ -583,9 +583,9 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * Find the destination for the given service */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { - if ((dest->af == svc->af) - && ip_vs_addr_equal(svc->af, &dest->addr, daddr) - && (dest->port == dport)) { + if ((dest->af == dest_af) && + ip_vs_addr_equal(dest_af, &dest->addr, daddr) && + (dest->port == dport)) { /* HIT */ return dest; } @@ -602,7 +602,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * on the backup. * Called under RCU lock, no refcnt is returned. */ -struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, +struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, @@ -613,14 +613,14 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, struct ip_vs_service *svc; __be16 port = dport; - svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) port = 0; - dest = ip_vs_lookup_dest(svc, daddr, port); + dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); if (!dest) - dest = ip_vs_lookup_dest(svc, daddr, port ^ dport); + dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); return dest; } @@ -938,7 +938,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &daddr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest != NULL) { @@ -1002,7 +1002,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &daddr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest == NULL) { @@ -1084,7 +1084,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); rcu_read_unlock(); if (dest == NULL) { diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index eadffb29dec0..edd266414b7d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -880,8 +880,14 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, * but still handled. */ rcu_read_lock(); - dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, - param->vport, protocol, fwmark, flags); + /* This function is only invoked by the synchronization + * code. We do not currently support heterogeneous pools + * with synchronization, so we can make the assumption that + * the svc_af is the same as the dest_af + */ + dest = ip_vs_find_dest(net, type, type, daddr, dport, + param->vaddr, param->vport, protocol, + fwmark, flags); cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); rcu_read_unlock(); -- cgit v1.2.3 From ad147aa4dd2135e6d86e3329d4009283ba64287f Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:22 -0700 Subject: ipvs: Pass destination address family to ip_vs_trash_get_dest Part of a series of diffs to tease out destination family from virtual family. This diff just adds a parameter to ip_vs_trash_get and then uses it for comparison rather than svc->af. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c840d895bd1a..6bd2cc682137 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -657,8 +657,8 @@ static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) * scheduling. */ static struct ip_vs_dest * -ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) +ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, + const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; struct netns_ipvs *ipvs = net_ipvs(svc->net); @@ -671,11 +671,11 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); - if (dest->af == svc->af && - ip_vs_addr_equal(svc->af, &dest->addr, daddr) && + if (dest->af == dest_af && + ip_vs_addr_equal(dest_af, &dest->addr, daddr) && dest->port == dport && dest->vfwmark == svc->fwmark && dest->protocol == svc->protocol && @@ -950,7 +950,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) * Check if the dest already exists in the trash and * is from the same service */ - dest = ip_vs_trash_get_dest(svc, &daddr, dport); + dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); if (dest != NULL) { IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " -- cgit v1.2.3 From ba38528aae6ee2d22226c6a78727ddc13512b068 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:23 -0700 Subject: ipvs: Supply destination address family to ip_vs_conn_new The assumption that dest af is equal to service af is now unreliable, so we must specify it manually so as not to copy just the first 4 bytes of a v6 address or doing an illegal read of 16 butes on a v6 address. We "lie" in two places: for synchronization (which we will explicitly disallow from happening when we have heterogeneous pools) and for black hole addresses where there's no real dest. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 ++- net/netfilter/ipvs/ip_vs_conn.c | 5 +++-- net/netfilter/ipvs/ip_vs_core.c | 9 +++++---- net/netfilter/ipvs/ip_vs_ftp.c | 6 ++++-- net/netfilter/ipvs/ip_vs_sync.c | 3 ++- 5 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 2fa1155b24f7..7600dbe5780e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -535,6 +535,7 @@ struct ip_vs_conn { union nf_inet_addr daddr; /* destination address */ volatile __u32 flags; /* status flags */ __u16 protocol; /* Which protocol (TCP/UDP) */ + __u16 daf; /* Address family of the dest */ #ifdef CONFIG_NET_NS struct net *net; /* Name space */ #endif @@ -1213,7 +1214,7 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); -struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, +struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 8f4c602bb34b..fdb4880a3a79 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -854,7 +854,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) * Create a new connection entry and hash it into the ip_vs_conn_tab */ struct ip_vs_conn * -ip_vs_conn_new(const struct ip_vs_conn_param *p, +ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark) { @@ -873,6 +873,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); ip_vs_conn_net_set(cp, p->net); cp->af = p->af; + cp->daf = dest_af; cp->protocol = p->protocol; ip_vs_addr_set(p->af, &cp->caddr, p->caddr); cp->cport = p->cport; @@ -880,7 +881,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, &cp->vaddr, p->vaddr); cp->vport = p->vport; - ip_vs_addr_set(p->af, &cp->daddr, daddr); + ip_vs_addr_set(cp->daf, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; cp->fwmark = fwmark; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5c34e8d42e01..1f6ecb74821f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -328,7 +328,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * This adds param.pe_data to the template, * and thus param.pe_data will be destroyed * when the template expires */ - ct = ip_vs_conn_new(¶m, &dest->addr, dport, + ct = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, IP_VS_CONN_F_TEMPLATE, dest, skb->mark); if (ct == NULL) { kfree(param.pe_data); @@ -357,7 +357,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, src_port, &iph->daddr, dst_port, ¶m); - cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); + cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, + skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); *ignored = -1; @@ -479,7 +480,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); - cp = ip_vs_conn_new(&p, &dest->addr, + cp = ip_vs_conn_new(&p, dest->af, &dest->addr, dest->port ? dest->port : pptr[1], flags, dest, skb->mark); if (!cp) { @@ -550,7 +551,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); - cp = ip_vs_conn_new(&p, &daddr, 0, + cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, NULL, skb->mark); if (!cp) diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 77c173282f38..a64fa15790e5 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -233,7 +233,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, IPPROTO_TCP, &cp->caddr, 0, &cp->vaddr, port, &p); - n_cp = ip_vs_conn_new(&p, &from, port, + /* As above, this is ipv4 only */ + n_cp = ip_vs_conn_new(&p, AF_INET, &from, port, IP_VS_CONN_F_NO_CPORT | IP_VS_CONN_F_NFCT, cp->dest, skb->mark); @@ -396,7 +397,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { - n_cp = ip_vs_conn_new(&p, &cp->daddr, + /* This is ipv4 only */ + n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr, htons(ntohs(cp->dport)-1), IP_VS_CONN_F_NFCT, cp->dest, skb->mark); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index edd266414b7d..7162c86fd50d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -889,7 +889,8 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, param->vaddr, param->vport, protocol, fwmark, flags); - cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); + cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, + fwmark); rcu_read_unlock(); if (!cp) { kfree(param->pe_data); -- cgit v1.2.3 From 391f503d69779867f05e9296ae523e9002c2d7ee Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:24 -0700 Subject: ipvs: prevent mixing heterogeneous pools and synchronization The synchronization protocol is not compatible with heterogeneous pools, so we need to verify that we're not turning both on at the same time. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++++ net/netfilter/ipvs/ip_vs_ctl.c | 15 +++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 7600dbe5780e..576d7f0bed5d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -990,6 +990,10 @@ struct netns_ipvs { char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; /* net name space ptr */ struct net *net; /* Needed by timer routines */ + /* Number of heterogeneous destinations, needed because + * heterogeneous are not supported when synchronization is + * enabled */ + unsigned int mixed_address_family_dests; }; #define DEFAULT_SYNC_THRESHOLD 3 diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6bd2cc682137..462760eded94 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -779,6 +779,12 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_scheduler *sched; int conn_flags; + /* We cannot modify an address and change the address family */ + BUG_ON(!add && udest->af != dest->af); + + if (add && udest->af != svc->af) + ipvs->mixed_address_family_dests++; + /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; @@ -1061,6 +1067,9 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, list_del_rcu(&dest->n_list); svc->num_dests--; + if (dest->af != svc->af) + net_ipvs(svc->net)->mixed_address_family_dests--; + if (svcupd) { struct ip_vs_scheduler *sched; @@ -3256,6 +3265,12 @@ static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; + /* The synchronization protocol is incompatible with mixed family + * services + */ + if (net_ipvs(net)->mixed_address_family_dests > 0) + return -EINVAL; + return start_sync_thread(net, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), -- cgit v1.2.3 From 4a4739d56b006c4b34dfba03c356056e110521ca Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:25 -0700 Subject: ipvs: Pull out crosses_local_route_boundary logic This logic is repeated in both out_rt functions so it was redundant. Additionally, we'll need to be able to do checks to route v4 to v6 and vice versa in order to deal with heterogeneous pools. This patch also updates the callsites to add an additional parameter to the out route functions. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 144 +++++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 67 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 56896a412bce..b3b54d7a6c17 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -157,9 +157,56 @@ retry: return rt; } +#ifdef CONFIG_IP_VS_IPV6 +static inline int __ip_vs_is_local_route6(struct rt6_info *rt) +{ + return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; +} +#endif + +static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, + int rt_mode, + bool new_rt_is_local) +{ + bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); + bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); + bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR); + bool source_is_loopback; + bool old_rt_is_local; + +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); + + source_is_loopback = + (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (addr_type & IPV6_ADDR_LOOPBACK); + old_rt_is_local = __ip_vs_is_local_route6( + (struct rt6_info *)skb_dst(skb)); + } else +#endif + { + source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr); + old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; + } + + if (unlikely(new_rt_is_local)) { + if (!rt_mode_allow_local) + return true; + if (!rt_mode_allow_redirect && !old_rt_is_local) + return true; + } else { + if (!rt_mode_allow_non_local) + return true; + if (source_is_loopback) + return true; + } + return false; +} + /* Get route to destination or remote server */ static int -__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, +__ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, __be32 daddr, int rt_mode, __be32 *ret_saddr) { struct net *net = dev_net(skb_dst(skb)->dev); @@ -218,30 +265,15 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, } local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; - if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & - rt_mode)) { - IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", - (rt->rt_flags & RTCF_LOCAL) ? - "local":"non-local", &daddr); + if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, + local))) { + IP_VS_DBG_RL("We are crossing local and non-local addresses" + " daddr=%pI4\n", &dest->addr.ip); goto err_put; } iph = ip_hdr(skb); - if (likely(!local)) { - if (unlikely(ipv4_is_loopback(iph->saddr))) { - IP_VS_DBG_RL("Stopping traffic from loopback address " - "%pI4 to non-local address, dest: %pI4\n", - &iph->saddr, &daddr); - goto err_put; - } - } else { - ort = skb_rtable(skb); - if (!(rt_mode & IP_VS_RT_MODE_RDR) && - !(ort->rt_flags & RTCF_LOCAL)) { - IP_VS_DBG_RL("Redirect from non-local address %pI4 to " - "local requires NAT method, dest: %pI4\n", - &iph->daddr, &daddr); - goto err_put; - } + + if (unlikely(local)) { /* skb to local stack, preserve old route */ if (!noref) ip_rt_put(rt); @@ -295,12 +327,6 @@ err_unreach: } #ifdef CONFIG_IP_VS_IPV6 - -static inline int __ip_vs_is_local_route6(struct rt6_info *rt) -{ - return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; -} - static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, struct in6_addr *ret_saddr, int do_xfrm) @@ -339,7 +365,7 @@ out_err: * Get route to destination or remote server */ static int -__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, +__ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { @@ -393,32 +419,15 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, } local = __ip_vs_is_local_route6(rt); - if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & - rt_mode)) { - IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n", - local ? "local":"non-local", daddr); + + if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, + local))) { + IP_VS_DBG_RL("We are crossing local and non-local addresses" + " daddr=%pI6\n", &dest->addr.in6); goto err_put; } - if (likely(!local)) { - if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&ipv6_hdr(skb)->saddr) & - IPV6_ADDR_LOOPBACK)) { - IP_VS_DBG_RL("Stopping traffic from loopback address " - "%pI6c to non-local address, " - "dest: %pI6c\n", - &ipv6_hdr(skb)->saddr, daddr); - goto err_put; - } - } else { - ort = (struct rt6_info *) skb_dst(skb); - if (!(rt_mode & IP_VS_RT_MODE_RDR) && - !__ip_vs_is_local_route6(ort)) { - IP_VS_DBG_RL("Redirect from non-local address %pI6c " - "to local requires NAT method, " - "dest: %pI6c\n", - &ipv6_hdr(skb)->daddr, daddr); - goto err_put; - } + + if (unlikely(local)) { /* skb to local stack, preserve old route */ if (!noref) dst_release(&rt->dst); @@ -556,8 +565,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, - NULL) < 0) + if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr, + IP_VS_RT_MODE_NON_LOCAL, NULL) < 0) goto tx_error; ip_send_check(iph); @@ -586,7 +595,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL, + if (__ip_vs_get_out_rt_v6(cp->af, skb, NULL, &ipvsh->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; @@ -633,7 +642,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } was_input = rt_is_input_route(skb_rtable(skb)); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR, NULL); @@ -721,8 +730,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - ipvsh, 0, + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR); @@ -829,7 +838,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_CONNECT | @@ -928,7 +937,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, &saddr, ipvsh, 1, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -1021,7 +1030,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_KNOWN_NH, NULL); @@ -1060,8 +1069,8 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - ipvsh, 0, + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL); if (local < 0) @@ -1128,7 +1137,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL); + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, + NULL); if (local < 0) goto tx_error; rt = skb_rtable(skb); @@ -1219,8 +1229,8 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - ipvsh, 0, rt_mode); + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; rt = (struct rt6_info *) skb_dst(skb); -- cgit v1.2.3 From 919aa0b2bbcf013467295dc9736db6fb575a4fb0 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:26 -0700 Subject: ipvs: Pull out update_pmtu code Another step toward heterogeneous pools, this removes another piece of functionality currently specific to each address family type. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b3b54d7a6c17..034a282a6f8c 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -204,6 +204,15 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, return false; } +static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) +{ + struct sock *sk = skb->sk; + struct rtable *ort = skb_rtable(skb); + + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); +} + /* Get route to destination or remote server */ static int __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, @@ -213,7 +222,6 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ - struct rtable *ort; /* Original route */ struct iphdr *iph; __be16 df; int mtu; @@ -284,16 +292,12 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, mtu = dst_mtu(&rt->dst); df = iph->frag_off & htons(IP_DF); } else { - struct sock *sk = skb->sk; - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; } - ort = skb_rtable(skb); - if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) - ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + maybe_update_pmtu(skb_af, skb, mtu); /* MTU check allowed? */ df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0; } @@ -372,7 +376,6 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, struct net *net = dev_net(skb_dst(skb)->dev); struct ip_vs_dest_dst *dest_dst; struct rt6_info *rt; /* Route to the other host */ - struct rt6_info *ort; /* Original route */ struct dst_entry *dst; int mtu; int local, noref = 1; @@ -438,17 +441,13 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) mtu = dst_mtu(&rt->dst); else { - struct sock *sk = skb->sk; - mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); goto err_put; } - ort = (struct rt6_info *) skb_dst(skb); - if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) - ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + maybe_update_pmtu(skb_af, skb, mtu); } if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { -- cgit v1.2.3 From c63e4de2be5e1d253adce16dbba57ed42868bc22 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:27 -0700 Subject: ipvs: Add generic ensure_mtu_is_adequate to handle mixed pools The out_rt functions check to see if the mtu is large enough for the packet and, if not, send icmp messages (TOOBIG or DEST_UNREACH) to the source and bail out. We needed the ability to send ICMP from the out_rt_v6 function and DEST_UNREACH from the out_rt function, so we just pulled it out into a common function. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 77 +++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 034a282a6f8c..fa2fdd7421b7 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -213,17 +213,57 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } +static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, + struct ip_vs_iphdr *ipvsh, + struct sk_buff *skb, int mtu) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + struct net *net = dev_net(skb_dst(skb)->dev); + + if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { + if (!skb->dev) + skb->dev = net->loopback_dev; + /* only send ICMP too big on first fragment */ + if (!ipvsh->fragoffs) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG(1, "frag needed for %pI6c\n", + &ipv6_hdr(skb)->saddr); + return false; + } + } else +#endif + { + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + + /* If we're going to tunnel the packet and pmtu discovery + * is disabled, we'll just fragment it anyway + */ + if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs)) + return true; + + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) && + skb->len > mtu && !skb_is_gso(skb))) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + IP_VS_DBG(1, "frag needed for %pI4\n", + &ip_hdr(skb)->saddr); + return false; + } + } + + return true; +} + /* Get route to destination or remote server */ static int __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, - __be32 daddr, int rt_mode, __be32 *ret_saddr) + __be32 daddr, int rt_mode, __be32 *ret_saddr, + struct ip_vs_iphdr *ipvsh) { struct net *net = dev_net(skb_dst(skb)->dev); - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ - struct iphdr *iph; - __be16 df; int mtu; int local, noref = 1; @@ -279,7 +319,6 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, " daddr=%pI4\n", &dest->addr.ip); goto err_put; } - iph = ip_hdr(skb); if (unlikely(local)) { /* skb to local stack, preserve old route */ @@ -290,7 +329,6 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { mtu = dst_mtu(&rt->dst); - df = iph->frag_off & htons(IP_DF); } else { mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { @@ -298,16 +336,10 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, goto err_put; } maybe_update_pmtu(skb_af, skb, mtu); - /* MTU check allowed? */ - df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0; } - /* MTU checking */ - if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr); + if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; - } skb_dst_drop(skb); if (noref) { @@ -450,15 +482,8 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, maybe_update_pmtu(skb_af, skb, mtu); } - if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { - if (!skb->dev) - skb->dev = net->loopback_dev; - /* only send ICMP too big on first fragment */ - if (!ipvsh->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); + if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; - } skb_dst_drop(skb); if (noref) { @@ -565,7 +590,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rcu_read_lock(); if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr, - IP_VS_RT_MODE_NON_LOCAL, NULL) < 0) + IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) goto tx_error; ip_send_check(iph); @@ -644,7 +669,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_RDR, NULL); + IP_VS_RT_MODE_RDR, NULL, ipvsh); if (local < 0) goto tx_error; rt = skb_rtable(skb); @@ -841,7 +866,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_CONNECT | - IP_VS_RT_MODE_TUNNEL, &saddr); + IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh); if (local < 0) goto tx_error; if (local) { @@ -1032,7 +1057,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_KNOWN_NH, NULL); + IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); if (local < 0) goto tx_error; if (local) { @@ -1137,7 +1162,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, - NULL); + NULL, iph); if (local < 0) goto tx_error; rt = skb_rtable(skb); -- cgit v1.2.3 From 8052ba292559f907ea2ad4c827d83c195046dfe1 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:28 -0700 Subject: ipvs: support ipv4 in ipv6 and ipv6 in ipv4 tunnel forwarding Pull the common logic for preparing an skb to prepend the header into a single function and then set fields such that they can be used in either case (generalize tos and tclass to dscp, hop_limit and ttl to ttl, etc) Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 12 +++- net/netfilter/ipvs/ip_vs_xmit.c | 148 +++++++++++++++++++++++++++++----------- 2 files changed, 117 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index fdb4880a3a79..13e9cee02c81 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -488,7 +488,12 @@ static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) break; case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + else +#endif + cp->packet_xmit = ip_vs_tunnel_xmit; break; case IP_VS_CONN_F_DROUTE: @@ -514,7 +519,10 @@ static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) break; case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit_v6; + if (cp->daf == AF_INET6) + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + else + cp->packet_xmit = ip_vs_tunnel_xmit; break; case IP_VS_CONN_F_DROUTE: diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index fa2fdd7421b7..91f17c1eb8a2 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -824,6 +824,81 @@ tx_error: } #endif +/* When forwarding a packet, we must ensure that we've got enough headroom + * for the encapsulation packet in the skb. This also gives us an + * opportunity to figure out what the payload_len, dsfield, ttl, and df + * values should be, so that we won't need to look at the old ip header + * again + */ +static struct sk_buff * +ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, + unsigned int max_headroom, __u8 *next_protocol, + __u32 *payload_len, __u8 *dsfield, __u8 *ttl, + __be16 *df) +{ + struct sk_buff *new_skb = NULL; + struct iphdr *old_iph = NULL; +#ifdef CONFIG_IP_VS_IPV6 + struct ipv6hdr *old_ipv6h = NULL; +#endif + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { + new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) + goto error; + consume_skb(skb); + skb = new_skb; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + old_ipv6h = ipv6_hdr(skb); + *next_protocol = IPPROTO_IPV6; + if (payload_len) + *payload_len = + ntohs(old_ipv6h->payload_len) + + sizeof(*old_ipv6h); + *dsfield = ipv6_get_dsfield(old_ipv6h); + *ttl = old_ipv6h->hop_limit; + if (df) + *df = 0; + } else +#endif + { + old_iph = ip_hdr(skb); + /* Copy DF, reset fragment offset and MF */ + if (df) + *df = (old_iph->frag_off & htons(IP_DF)); + *next_protocol = IPPROTO_IPIP; + + /* fix old IP header checksum */ + ip_send_check(old_iph); + *dsfield = ipv4_get_dsfield(old_iph); + *ttl = old_iph->ttl; + if (payload_len) + *payload_len = ntohs(old_iph->tot_len); + } + + return skb; +error: + kfree_skb(skb); + return ERR_PTR(-ENOMEM); +} + +static inline int __tun_gso_type_mask(int encaps_af, int orig_af) +{ + if (encaps_af == AF_INET) { + if (orig_af == AF_INET) + return SKB_GSO_IPIP; + + return SKB_GSO_SIT; + } + + /* GSO: we need to provide proper SKB_GSO_ value for IPv6: + * SKB_GSO_SIT/IPV6 + */ + return 0; +} /* * IP Tunneling transmitter @@ -852,9 +927,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct rtable *rt; /* Route to the other host */ __be32 saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); - u8 tos = old_iph->tos; - __be16 df; + __u8 next_protocol = 0; + __u8 dsfield = 0; + __u8 ttl = 0; + __be16 df = 0; + __be16 *dfp = NULL; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; @@ -877,29 +954,21 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rt = skb_rtable(skb); tdev = rt->dst.dev; - /* Copy DF, reset fragment offset and MF */ - df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0; - /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); - - if (!new_skb) - goto tx_error; - consume_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); - } - - /* fix old IP header checksum */ - ip_send_check(old_iph); + /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ + dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; + skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, + &next_protocol, NULL, &dsfield, + &ttl, dfp); + if (IS_ERR(skb)) + goto tx_error; - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + skb = iptunnel_handle_offloads( + skb, false, __tun_gso_type_mask(AF_INET, cp->af)); if (IS_ERR(skb)) goto tx_error; @@ -916,11 +985,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; - iph->protocol = IPPROTO_IPIP; - iph->tos = tos; + iph->protocol = next_protocol; + iph->tos = dsfield; iph->daddr = cp->daddr.ip; iph->saddr = saddr; - iph->ttl = old_iph->ttl; + iph->ttl = ttl; ip_select_ident(skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ @@ -953,7 +1022,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct rt6_info *rt; /* Route to the other host */ struct in6_addr saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ - struct ipv6hdr *old_iph = ipv6_hdr(skb); + __u8 next_protocol = 0; + __u32 payload_len = 0; + __u8 dsfield = 0; + __u8 ttl = 0; struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; @@ -981,19 +1053,14 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); - if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); - - if (!new_skb) - goto tx_error; - consume_skb(skb); - skb = new_skb; - old_iph = ipv6_hdr(skb); - } + skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, + &next_protocol, &payload_len, + &dsfield, &ttl, NULL); + if (IS_ERR(skb)) + goto tx_error; - /* GSO: we need to provide proper SKB_GSO_ value for IPv6 */ - skb = iptunnel_handle_offloads(skb, false, 0); /* SKB_GSO_SIT/IPV6 */ + skb = iptunnel_handle_offloads( + skb, false, __tun_gso_type_mask(AF_INET6, cp->af)); if (IS_ERR(skb)) goto tx_error; @@ -1008,14 +1075,13 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ iph = ipv6_hdr(skb); iph->version = 6; - iph->nexthdr = IPPROTO_IPV6; - iph->payload_len = old_iph->payload_len; - be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); + iph->nexthdr = next_protocol; + iph->payload_len = htons(payload_len); memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); - ipv6_change_dsfield(iph, 0, ipv6_get_dsfield(old_iph)); + ipv6_change_dsfield(iph, 0, dsfield); iph->daddr = cp->daddr.in6; iph->saddr = saddr; - iph->hop_limit = old_iph->hop_limit; + iph->hop_limit = ttl; /* Another hack: avoid icmp_send in ip_fragment */ skb->ignore_df = 1; -- cgit v1.2.3 From f7fa38006983c0933a550fa790a3b3d3856394d1 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 9 Sep 2014 16:40:29 -0700 Subject: ipvs: address family of LBLC entry depends on svc family The LBLC entries should use svc->af, not dest->af. Needed to support svc->af != dest->af. Signed-off-by: Julian Anastasov Signed-off-by: Alex Gartrell Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_lblc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 547ff33c1efd..127f14046c51 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -199,11 +199,11 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, - struct ip_vs_dest *dest) + u16 af, struct ip_vs_dest *dest) { struct ip_vs_lblc_entry *en; - en = ip_vs_lblc_get(dest->af, tbl, daddr); + en = ip_vs_lblc_get(af, tbl, daddr); if (en) { if (en->dest == dest) return en; @@ -213,8 +213,8 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, if (!en) return NULL; - en->af = dest->af; - ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->af = af; + ip_vs_addr_copy(af, &en->addr, daddr); en->lastuse = jiffies; ip_vs_dest_hold(dest); @@ -521,13 +521,13 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, /* If we fail to create a cache entry, we'll just use the valid dest */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) - ip_vs_lblc_new(tbl, &iph->daddr, dest); + ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest); spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } -- cgit v1.2.3 From cf34e646dad101170e00712fe51986cbcdad3044 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 9 Sep 2014 16:40:30 -0700 Subject: ipvs: address family of LBLCR entry depends on svc family The LBLCR entries should use svc->af, not dest->af. Needed to support svc->af != dest->af. Signed-off-by: Julian Anastasov Signed-off-by: Alex Gartrell Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_lblcr.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 3f21a2f47de1..2229d2d8bbe0 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -362,18 +362,18 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, - struct ip_vs_dest *dest) + u16 af, struct ip_vs_dest *dest) { struct ip_vs_lblcr_entry *en; - en = ip_vs_lblcr_get(dest->af, tbl, daddr); + en = ip_vs_lblcr_get(af, tbl, daddr); if (!en) { en = kmalloc(sizeof(*en), GFP_ATOMIC); if (!en) return NULL; - en->af = dest->af; - ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->af = af; + ip_vs_addr_copy(af, &en->addr, daddr); en->lastuse = jiffies; /* initialize its dest set */ @@ -706,13 +706,13 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, /* If we fail to create a cache entry, we'll just use the valid dest */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) - ip_vs_lblcr_new(tbl, &iph->daddr, dest); + ip_vs_lblcr_new(tbl, &iph->daddr, svc->af, dest); spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } -- cgit v1.2.3 From 2ff3e4e4868675da1024175215991fa6d9856731 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 15 Sep 2014 19:15:28 -0700 Subject: openvswitch: Remove pkt_key from OVS_CB OVS keeps pointer to packet key in skb->cb, but the packet key is store on stack. This could make code bit tricky. So it is better to get rid of the pointer. Signed-off-by: Pravin B Shelar --- net/openvswitch/actions.c | 22 ++++++++++++---------- net/openvswitch/datapath.c | 6 ++---- net/openvswitch/datapath.h | 5 ++--- 3 files changed, 16 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 5231652a95d9..8a1eb562cdec 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -38,6 +38,7 @@ #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *attr, int len); static int make_writable(struct sk_buff *skb, int write_len) @@ -410,16 +411,14 @@ static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) } static int output_userspace(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr) { struct dp_upcall_info upcall; const struct nlattr *a; int rem; - BUG_ON(!OVS_CB(skb)->pkt_key); - upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.key = OVS_CB(skb)->pkt_key; + upcall.key = key; upcall.userdata = NULL; upcall.portid = 0; @@ -445,7 +444,7 @@ static bool last_action(const struct nlattr *a, int rem) } static int sample(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr) { const struct nlattr *acts_list = NULL; const struct nlattr *a; @@ -493,7 +492,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, * return the error code and let the caller (also * do_execute_actions()) free skb on error. */ - return do_execute_actions(dp, sample_skb, a, rem); + return do_execute_actions(dp, sample_skb, key, a, rem); } static int execute_set_action(struct sk_buff *skb, @@ -544,6 +543,7 @@ static int execute_set_action(struct sk_buff *skb, /* Execute a list of actions against 'skb'. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *attr, int len) { /* Every output action needs a separate clone of 'skb', but the common @@ -569,7 +569,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_USERSPACE: - output_userspace(dp, skb, a); + output_userspace(dp, skb, key, a); break; case OVS_ACTION_ATTR_PUSH_VLAN: @@ -587,7 +587,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_SAMPLE: - err = sample(dp, skb, a); + err = sample(dp, skb, key, a); if (unlikely(err)) /* skb already freed. */ return err; break; @@ -608,10 +608,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } /* Execute a list of actions against 'skb'. */ -int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key) { struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); OVS_CB(skb)->tun_key = NULL; - return do_execute_actions(dp, skb, acts->actions, acts->actions_len); + return do_execute_actions(dp, skb, key, + acts->actions, acts->actions_len); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 91d66b7e64ac..458096da138a 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -275,10 +275,9 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) } OVS_CB(skb)->flow = flow; - OVS_CB(skb)->pkt_key = &key; ovs_flow_stats_update(OVS_CB(skb)->flow, key.tp.flags, skb); - ovs_execute_actions(dp, skb); + ovs_execute_actions(dp, skb, &key); stats_counter = &stats->n_hit; out: @@ -568,7 +567,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_flow_free; OVS_CB(packet)->flow = flow; - OVS_CB(packet)->pkt_key = &flow->key; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; @@ -579,7 +577,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_unlock; local_bh_disable(); - err = ovs_execute_actions(dp, packet); + err = ovs_execute_actions(dp, packet, &flow->key); local_bh_enable(); rcu_read_unlock(); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 701b5738c38a..b576483aab7c 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -95,13 +95,11 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB * @flow: The flow associated with this packet. May be %NULL if no flow. - * @pkt_key: The flow information extracted from the packet. Must be nonnull. * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the * packet is not being tunneled. */ struct ovs_skb_cb { struct sw_flow *flow; - struct sw_flow_key *pkt_key; struct ovs_key_ipv4_tunnel *tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -191,7 +189,8 @@ int ovs_dp_upcall(struct datapath *, struct sk_buff *, struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); -int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *); void ovs_dp_notify_wq(struct work_struct *work); #define OVS_NLERR(fmt, ...) \ -- cgit v1.2.3 From 83c8df26a3b654871c0503fcf6eac61777e12ea1 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 15 Sep 2014 19:20:31 -0700 Subject: openvswitch: refactor ovs flow extract API. OVS flow extract is called on packet receive or packet execute code path. Following patch defines separate API for extracting flow-key in packet execute code path. Signed-off-by: Pravin B Shelar Acked-by: Andy Zhou --- net/openvswitch/datapath.c | 21 ++++++++++++------ net/openvswitch/datapath.h | 5 ++++- net/openvswitch/flow.c | 49 ++++++++++++++++++++++++++++++------------ net/openvswitch/flow.h | 6 +++++- net/openvswitch/flow_netlink.c | 22 ++++++------------- net/openvswitch/flow_netlink.h | 4 ++-- net/openvswitch/vport.c | 5 +++-- 7 files changed, 71 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 458096da138a..0cce8e60d5ed 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -237,8 +237,9 @@ void ovs_dp_detach_port(struct vport *p) } /* Must be called with rcu_read_lock. */ -void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) +void ovs_dp_process_received_packet(struct sk_buff *skb) { + const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct dp_stats_percpu *stats; @@ -250,7 +251,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) stats = this_cpu_ptr(dp->stats_percpu); /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_extract(skb, p->port_no, &key); + error = ovs_flow_key_extract(skb, &key); if (unlikely(error)) { kfree_skb(skb); return; @@ -514,6 +515,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct sw_flow *flow; struct datapath *dp; struct ethhdr *eth; + struct vport *input_vport; int len; int err; @@ -548,13 +550,11 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_extract(packet, -1, &flow->key); + err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet, + &flow->key); if (err) goto err_flow_free; - err = ovs_nla_get_flow_metadata(flow, a[OVS_PACKET_ATTR_KEY]); - if (err) - goto err_flow_free; acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); err = PTR_ERR(acts); if (IS_ERR(acts)) @@ -576,6 +576,15 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (!dp) goto err_unlock; + input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port); + if (!input_vport) + input_vport = ovs_vport_rcu(dp, OVSP_LOCAL); + + if (!input_vport) + goto err_unlock; + + OVS_CB(packet)->input_vport = input_vport; + local_bh_disable(); err = ovs_execute_actions(dp, packet, &flow->key); local_bh_enable(); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index b576483aab7c..2b982fae6a11 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -97,10 +97,13 @@ struct datapath { * @flow: The flow associated with this packet. May be %NULL if no flow. * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the * packet is not being tunneled. + * @input_vport: The original vport packet came in on. This value is cached + * when a packet is received by OVS. */ struct ovs_skb_cb { struct sw_flow *flow; struct ovs_key_ipv4_tunnel *tun_key; + struct vport *input_vport; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -181,7 +184,7 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; -void ovs_dp_process_received_packet(struct vport *, struct sk_buff *); +void ovs_dp_process_received_packet(struct sk_buff *); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 7064da92f420..d186eb65a391 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -16,8 +16,6 @@ * 02110-1301, USA */ -#include "flow.h" -#include "datapath.h" #include #include #include @@ -46,6 +44,10 @@ #include #include +#include "datapath.h" +#include "flow.h" +#include "flow_netlink.h" + u64 ovs_flow_used_time(unsigned long flow_jiffies) { struct timespec cur_ts; @@ -420,10 +422,9 @@ invalid: } /** - * ovs_flow_extract - extracts a flow key from an Ethernet frame. + * key_extract - extracts a flow key from an Ethernet frame. * @skb: sk_buff that contains the frame, with skb->data pointing to the * Ethernet header - * @in_port: port number on which @skb was received. * @key: output flow key * * The caller must ensure that skb->len >= ETH_HLEN. @@ -442,19 +443,11 @@ invalid: * of a correct length, otherwise the same as skb->network_header. * For other key->eth.type values it is left untouched. */ -int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) +static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) { int error; struct ethhdr *eth; - memset(key, 0, sizeof(*key)); - - key->phy.priority = skb->priority; - if (OVS_CB(skb)->tun_key) - memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key)); - key->phy.in_port = in_port; - key->phy.skb_mark = skb->mark; - skb_reset_mac_header(skb); /* Link layer. We are guaranteed to have at least the 14 byte Ethernet @@ -610,6 +603,34 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) } } } - return 0; } + +int ovs_flow_key_extract(struct sk_buff *skb, struct sw_flow_key *key) +{ + /* Extract metadata from packet. */ + memset(key, 0, sizeof(*key)); + if (OVS_CB(skb)->tun_key) + memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key)); + + key->phy.priority = skb->priority; + key->phy.in_port = OVS_CB(skb)->input_vport->port_no; + key->phy.skb_mark = skb->mark; + + return key_extract(skb, key); +} + +int ovs_flow_key_extract_userspace(const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key) +{ + int err; + + memset(key, 0, sizeof(*key)); + /* Extract metadata from netlink attributes. */ + err = ovs_nla_get_flow_metadata(attr, key); + if (err) + return err; + + return key_extract(skb, key); +} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 5e5aaed3a85b..251789b6ec45 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -187,6 +187,10 @@ void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); -int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *); +int ovs_flow_key_extract(struct sk_buff *skb, struct sw_flow_key *key); +/* Extract key from packet coming from userspace. */ +int ovs_flow_key_extract_userspace(const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key); #endif /* flow.h */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d757848da89c..630b320fbf3e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -836,7 +836,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, /** * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. - * @flow: Receives extracted in_port, priority, tun_key and skb_mark. + * @key: Receives extracted in_port, priority, tun_key and skb_mark. * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. * @@ -846,32 +846,24 @@ int ovs_nla_get_match(struct sw_flow_match *match, * extracted from the packet itself. */ -int ovs_nla_get_flow_metadata(struct sw_flow *flow, - const struct nlattr *attr) +int ovs_nla_get_flow_metadata(const struct nlattr *attr, + struct sw_flow_key *key) { - struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + struct sw_flow_match match; u64 attrs = 0; int err; - struct sw_flow_match match; - - flow->key.phy.in_port = DP_MAX_PORTS; - flow->key.phy.priority = 0; - flow->key.phy.skb_mark = 0; - memset(tun_key, 0, sizeof(flow->key.tun_key)); err = parse_flow_nlattrs(attr, a, &attrs); if (err) return -EINVAL; memset(&match, 0, sizeof(match)); - match.key = &flow->key; + match.key = key; - err = metadata_from_nlattrs(&match, &attrs, a, false); - if (err) - return err; + key->phy.in_port = DP_MAX_PORTS; - return 0; + return metadata_from_nlattrs(&match, &attrs, a, false); } int ovs_nla_put_flow(const struct sw_flow_key *swkey, diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 440151045d39..206e45add888 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -42,8 +42,8 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_flow(const struct sw_flow_key *, const struct sw_flow_key *, struct sk_buff *); -int ovs_nla_get_flow_metadata(struct sw_flow *flow, - const struct nlattr *attr); +int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *); + int ovs_nla_get_match(struct sw_flow_match *match, const struct nlattr *, const struct nlattr *); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index f7e63f9df7b9..acf31aa89e01 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -443,7 +443,8 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, u64_stats_update_end(&stats->syncp); OVS_CB(skb)->tun_key = tun_key; - ovs_dp_process_received_packet(vport, skb); + OVS_CB(skb)->input_vport = vport; + ovs_dp_process_received_packet(skb); } /** -- cgit v1.2.3 From 8c8b1b83fcdd0f05e1f66ed6f8a2e831d5d374a2 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 15 Sep 2014 19:28:44 -0700 Subject: openvswitch: Use tun_key only for egress tunnel path. Currently tun_key is used for passing tunnel information on ingress and egress path, this cause confusion. Following patch removes its use on ingress path make it egress only parameter. Signed-off-by: Pravin B Shelar Acked-by: Andy Zhou --- net/openvswitch/actions.c | 3 +-- net/openvswitch/datapath.c | 20 ++++++-------------- net/openvswitch/datapath.h | 8 ++++---- net/openvswitch/flow.c | 7 ++++--- net/openvswitch/flow.h | 3 ++- net/openvswitch/vport-gre.c | 23 ++++++++++++----------- net/openvswitch/vport-vxlan.c | 21 +++++++++++---------- net/openvswitch/vport.c | 12 ++++++++++-- 8 files changed, 50 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 8a1eb562cdec..1cdb539d05bd 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -510,7 +510,7 @@ static int execute_set_action(struct sk_buff *skb, break; case OVS_KEY_ATTR_IPV4_TUNNEL: - OVS_CB(skb)->tun_key = nla_data(nested_attr); + OVS_CB(skb)->egress_tun_key = nla_data(nested_attr); break; case OVS_KEY_ATTR_ETHERNET: @@ -613,7 +613,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, { struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); - OVS_CB(skb)->tun_key = NULL; return do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0cce8e60d5ed..7e0819919b8a 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -237,33 +237,25 @@ void ovs_dp_detach_port(struct vport *p) } /* Must be called with rcu_read_lock. */ -void ovs_dp_process_received_packet(struct sk_buff *skb) +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct dp_stats_percpu *stats; - struct sw_flow_key key; u64 *stats_counter; u32 n_mask_hit; - int error; stats = this_cpu_ptr(dp->stats_percpu); - /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_key_extract(skb, &key); - if (unlikely(error)) { - kfree_skb(skb); - return; - } - /* Look up flow. */ - flow = ovs_flow_tbl_lookup_stats(&dp->table, &key, &n_mask_hit); + flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; + int error; upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.key = &key; + upcall.key = key; upcall.userdata = NULL; upcall.portid = ovs_vport_find_upcall_portid(p, skb); error = ovs_dp_upcall(dp, skb, &upcall); @@ -277,8 +269,8 @@ void ovs_dp_process_received_packet(struct sk_buff *skb) OVS_CB(skb)->flow = flow; - ovs_flow_stats_update(OVS_CB(skb)->flow, key.tp.flags, skb); - ovs_execute_actions(dp, skb, &key); + ovs_flow_stats_update(OVS_CB(skb)->flow, key->tp.flags, skb); + ovs_execute_actions(dp, skb, key); stats_counter = &stats->n_hit; out: diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 2b982fae6a11..25b0e888cb27 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -95,15 +95,15 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB * @flow: The flow associated with this packet. May be %NULL if no flow. - * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the - * packet is not being tunneled. + * @egress_tun_key: Tunnel information about this packet on egress path. + * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. */ struct ovs_skb_cb { struct sw_flow *flow; - struct ovs_key_ipv4_tunnel *tun_key; struct vport *input_vport; + struct ovs_key_ipv4_tunnel *egress_tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -184,7 +184,7 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; -void ovs_dp_process_received_packet(struct sk_buff *); +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index d186eb65a391..bf8442071d75 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -606,12 +606,13 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return 0; } -int ovs_flow_key_extract(struct sk_buff *skb, struct sw_flow_key *key) +int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, + struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ memset(key, 0, sizeof(*key)); - if (OVS_CB(skb)->tun_key) - memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key)); + if (tun_key) + memcpy(&key->tun_key, tun_key, sizeof(key->tun_key)); key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 251789b6ec45..3869a540365c 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -187,7 +187,8 @@ void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); -int ovs_flow_key_extract(struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, + struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ int ovs_flow_key_extract_userspace(const struct nlattr *attr, struct sk_buff *skb, diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index f49148a07da2..309cca6e816f 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -63,7 +63,7 @@ static __be16 filter_tnl_flags(__be16 flags) static struct sk_buff *__build_header(struct sk_buff *skb, int tunnel_hlen) { - const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key; + const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->egress_tun_key; struct tnl_ptk_info tpi; skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); @@ -129,6 +129,7 @@ static int gre_err(struct sk_buff *skb, u32 info, static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); + struct ovs_key_ipv4_tunnel *tun_key; struct flowi4 fl; struct rtable *rt; int min_headroom; @@ -136,16 +137,17 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 df; int err; - if (unlikely(!OVS_CB(skb)->tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_key)) { err = -EINVAL; goto error; } + tun_key = OVS_CB(skb)->egress_tun_key; /* Route lookup */ memset(&fl, 0, sizeof(fl)); - fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; - fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; - fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_GRE; @@ -153,7 +155,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) if (IS_ERR(rt)) return PTR_ERR(rt); - tunnel_hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags); + tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags); min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct iphdr) @@ -185,15 +187,14 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) goto err_free_rt; } - df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; skb->ignore_df = 1; return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, - OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, - OVS_CB(skb)->tun_key->ipv4_tos, - OVS_CB(skb)->tun_key->ipv4_ttl, df, false); + tun_key->ipv4_dst, IPPROTO_GRE, + tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false); err_free_rt: ip_rt_put(rt); error: diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index d8b7e247bebf..f19539bb8adc 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Nicira, Inc. + * Copyright (c) 2014 Nicira, Inc. * Copyright (c) 2013 Cisco Systems, Inc. * * This program is free software; you can redistribute it and/or @@ -140,22 +140,24 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) struct net *net = ovs_dp_get_net(vport->dp); struct vxlan_port *vxlan_port = vxlan_vport(vport); __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + struct ovs_key_ipv4_tunnel *tun_key; struct rtable *rt; struct flowi4 fl; __be16 src_port; __be16 df; int err; - if (unlikely(!OVS_CB(skb)->tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_key)) { err = -EINVAL; goto error; } + tun_key = OVS_CB(skb)->egress_tun_key; /* Route lookup */ memset(&fl, 0, sizeof(fl)); - fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; - fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; - fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_UDP; @@ -165,7 +167,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) goto error; } - df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; skb->ignore_df = 1; @@ -173,11 +175,10 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) src_port = udp_flow_src_port(net, skb, 0, 0, true); err = vxlan_xmit_skb(vxlan_port->vs, rt, skb, - fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, - OVS_CB(skb)->tun_key->ipv4_tos, - OVS_CB(skb)->tun_key->ipv4_ttl, df, + fl.saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, tun_key->ipv4_ttl, df, src_port, dst_port, - htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8), + htonl(be64_to_cpu(tun_key->tun_id) << 8), false); if (err < 0) ip_rt_put(rt); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index acf31aa89e01..5df8377fcfb1 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -435,6 +435,8 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, struct ovs_key_ipv4_tunnel *tun_key) { struct pcpu_sw_netstats *stats; + struct sw_flow_key key; + int error; stats = this_cpu_ptr(vport->percpu_stats); u64_stats_update_begin(&stats->syncp); @@ -442,9 +444,15 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, stats->rx_bytes += skb->len; u64_stats_update_end(&stats->syncp); - OVS_CB(skb)->tun_key = tun_key; OVS_CB(skb)->input_vport = vport; - ovs_dp_process_received_packet(skb); + OVS_CB(skb)->egress_tun_key = NULL; + /* Extract flow from 'skb' into 'key'. */ + error = ovs_flow_key_extract(tun_key, skb, &key); + if (unlikely(error)) { + kfree_skb(skb); + return; + } + ovs_dp_process_packet(skb, &key); } /** -- cgit v1.2.3 From 32ae87ff795781b7ceffc44b7c694c1bb206a266 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Mon, 15 Sep 2014 19:33:50 -0700 Subject: openvswitch: simplify sample action implementation The current sample() function implementation is more complicated than necessary in handling single user space action optimization and skb reference counting. There is no functional changes. Signed-off-by: Andy Zhou Signed-off-by: Pravin B Shelar --- net/openvswitch/actions.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 1cdb539d05bd..c31bb80c984f 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -448,7 +448,6 @@ static int sample(struct datapath *dp, struct sk_buff *skb, { const struct nlattr *acts_list = NULL; const struct nlattr *a; - struct sk_buff *sample_skb; int rem; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; @@ -468,31 +467,26 @@ static int sample(struct datapath *dp, struct sk_buff *skb, rem = nla_len(acts_list); a = nla_data(acts_list); - /* Actions list is either empty or only contains a single user-space - * action, the latter being a special case as it is the only known - * usage of the sample action. - * In these special cases don't clone the skb as there are no - * side-effects in the nested actions. - * Otherwise, clone in case the nested actions have side effects. - */ - if (likely(rem == 0 || (nla_type(a) == OVS_ACTION_ATTR_USERSPACE && - last_action(a, rem)))) { - sample_skb = skb; - skb_get(skb); - } else { - sample_skb = skb_clone(skb, GFP_ATOMIC); - if (!sample_skb) /* Skip sample action when out of memory. */ - return 0; - } + /* Actions list is empty, do nothing */ + if (unlikely(!rem)) + return 0; - /* Note that do_execute_actions() never consumes skb. - * In the case where skb has been cloned above it is the clone that - * is consumed. Otherwise the skb_get(skb) call prevents - * consumption by do_execute_actions(). Thus, it is safe to simply - * return the error code and let the caller (also - * do_execute_actions()) free skb on error. + /* The only known usage of sample action is having a single user-space + * action. Treat this usage as a special case. + * The output_userspace() should clone the skb to be sent to the + * user space. This skb will be consumed by its caller. */ - return do_execute_actions(dp, sample_skb, key, a, rem); + if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && + last_action(a, rem))) + return output_userspace(dp, skb, key, a); + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + /* Skip the sample action when out of memory. */ + return 0; + + /* do_execute_actions() will consume the cloned skb. */ + return do_execute_actions(dp, skb, key, a, rem); } static int execute_set_action(struct sk_buff *skb, -- cgit v1.2.3 From 971427f353f3c42c8dcef62e7124440df68eb809 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Mon, 15 Sep 2014 19:37:25 -0700 Subject: openvswitch: Add recirc and hash action. Recirc action allows a packet to reenter openvswitch processing. currently openvswitch lookup flow for packet received and execute set of actions on that packet, with help of recirc action we can process/modify the packet and recirculate it back in openvswitch for another pass. OVS hash action calculates 5-tupple hash and set hash in flow-key hash. This can be used along with recirculation for distributing packets among different ports for bond devices. For example: OVS bonding can use following actions: Match on: bond flow; Action: hash, recirc(id) Match on: recirc-id == id and hash lower bits == a; Action: output port_bond_a Signed-off-by: Andy Zhou Acked-by: Jesse Gross Signed-off-by: Pravin B Shelar --- include/uapi/linux/openvswitch.h | 26 +++++ net/openvswitch/actions.c | 203 +++++++++++++++++++++++++++++++++++++-- net/openvswitch/datapath.c | 11 ++- net/openvswitch/datapath.h | 7 +- net/openvswitch/flow.c | 7 +- net/openvswitch/flow.h | 3 + net/openvswitch/flow_netlink.c | 43 ++++++++- 7 files changed, 288 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index a794d1dd7b40..f7fc507d82ab 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -289,6 +289,9 @@ enum ovs_key_attr { OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ OVS_KEY_ATTR_SCTP, /* struct ovs_key_sctp */ OVS_KEY_ATTR_TCP_FLAGS, /* be16 TCP flags. */ + OVS_KEY_ATTR_DP_HASH, /* u32 hash value. Value 0 indicates the hash + is not computed by the datapath. */ + OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ #ifdef __KERNEL__ OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ @@ -493,6 +496,27 @@ struct ovs_action_push_vlan { __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ }; +/* Data path hash algorithm for computing Datapath hash. + * + * The algorithm type only specifies the fields in a flow + * will be used as part of the hash. Each datapath is free + * to use its own hash algorithm. The hash value will be + * opaque to the user space daemon. + */ +enum ovs_hash_alg { + OVS_HASH_ALG_L4, +}; + +/* + * struct ovs_action_hash - %OVS_ACTION_ATTR_HASH action argument. + * @hash_alg: Algorithm used to compute hash prior to recirculation. + * @hash_basis: basis used for computing hash. + */ +struct ovs_action_hash { + uint32_t hash_alg; /* One of ovs_hash_alg. */ + uint32_t hash_basis; +}; + /** * enum ovs_action_attr - Action types. * @@ -521,6 +545,8 @@ enum ovs_action_attr { OVS_ACTION_ATTR_PUSH_VLAN, /* struct ovs_action_push_vlan. */ OVS_ACTION_ATTR_POP_VLAN, /* No argument. */ OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ + OVS_ACTION_ATTR_RECIRC, /* u32 recirc_id. */ + OVS_ACTION_ATTR_HASH, /* struct ovs_action_hash. */ __OVS_ACTION_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c31bb80c984f..6932a42e41a2 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -35,12 +35,78 @@ #include #include "datapath.h" +#include "flow.h" #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len); +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define DEFERRED_ACTION_FIFO_SIZE 10 +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. */ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +}; + +static struct action_fifo __percpu *action_fifos; +static DEFINE_PER_CPU(int, exec_actions_level); + +static void action_fifo_init(struct action_fifo *fifo) +{ + fifo->head = 0; + fifo->tail = 0; +} + +static bool action_fifo_is_empty(struct action_fifo *fifo) +{ + return (fifo->head == fifo->tail); +} + +static struct deferred_action *action_fifo_get(struct action_fifo *fifo) +{ + if (action_fifo_is_empty(fifo)) + return NULL; + + return &fifo->fifo[fifo->tail++]; +} + +static struct deferred_action *action_fifo_put(struct action_fifo *fifo) +{ + if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1) + return NULL; + + return &fifo->fifo[fifo->head++]; +} + +/* Return true if fifo is not full */ +static struct deferred_action *add_deferred_actions(struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct action_fifo *fifo; + struct deferred_action *da; + + fifo = this_cpu_ptr(action_fifos); + da = action_fifo_put(fifo); + if (da) { + da->skb = skb; + da->actions = attr; + da->pkt_key = *key; + } + + return da; +} + static int make_writable(struct sk_buff *skb, int write_len) { if (!pskb_may_pull(skb, write_len)) @@ -485,8 +551,29 @@ static int sample(struct datapath *dp, struct sk_buff *skb, /* Skip the sample action when out of memory. */ return 0; - /* do_execute_actions() will consume the cloned skb. */ - return do_execute_actions(dp, skb, key, a, rem); + if (!add_deferred_actions(skb, key, a)) { + if (net_ratelimit()) + pr_warn("%s: deferred actions limit reached, dropping sample action\n", + ovs_dp_name(dp)); + + kfree_skb(skb); + } + return 0; +} + +static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct ovs_action_hash *hash_act = nla_data(attr); + u32 hash = 0; + + /* OVS_HASH_ALG_L4 is the only possible hash algorithm. */ + hash = skb_get_hash(skb); + hash = jhash_1word(hash, hash_act->hash_basis); + if (!hash) + hash = 0x1; + + key->ovs_flow_hash = hash; } static int execute_set_action(struct sk_buff *skb, @@ -535,6 +622,44 @@ static int execute_set_action(struct sk_buff *skb, return err; } +static int execute_recirc(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *a, int rem) +{ + struct deferred_action *da; + int err; + + err = ovs_flow_key_update(skb, key); + if (err) + return err; + + if (!last_action(a, rem)) { + /* Recirc action is the not the last action + * of the action list, need to clone the skb. + */ + skb = skb_clone(skb, GFP_ATOMIC); + + /* Skip the recirc action when out of memory, but + * continue on with the rest of the action list. + */ + if (!skb) + return 0; + } + + da = add_deferred_actions(skb, key, NULL); + if (da) { + da->pkt_key.recirc_id = nla_get_u32(a); + } else { + kfree_skb(skb); + + if (net_ratelimit()) + pr_warn("%s: deferred action limit reached, drop recirc action\n", + ovs_dp_name(dp)); + } + + return 0; +} + /* Execute a list of actions against 'skb'. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, @@ -566,6 +691,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, output_userspace(dp, skb, key, a); break; + case OVS_ACTION_ATTR_HASH: + execute_hash(skb, key, a); + break; + case OVS_ACTION_ATTR_PUSH_VLAN: err = push_vlan(skb, nla_data(a)); if (unlikely(err)) /* skb already freed. */ @@ -576,6 +705,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, err = pop_vlan(skb); break; + case OVS_ACTION_ATTR_RECIRC: + err = execute_recirc(dp, skb, key, a, rem); + if (last_action(a, rem)) { + /* If this is the last action, the skb has + * been consumed or freed. + * Return immediately. + */ + return err; + } + break; + case OVS_ACTION_ATTR_SET: err = execute_set_action(skb, nla_data(a)); break; @@ -601,12 +741,63 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return 0; } +static void process_deferred_actions(struct datapath *dp) +{ + struct action_fifo *fifo = this_cpu_ptr(action_fifos); + + /* Do not touch the FIFO in case there is no deferred actions. */ + if (action_fifo_is_empty(fifo)) + return; + + /* Finishing executing all deferred actions. */ + do { + struct deferred_action *da = action_fifo_get(fifo); + struct sk_buff *skb = da->skb; + struct sw_flow_key *key = &da->pkt_key; + const struct nlattr *actions = da->actions; + + if (actions) + do_execute_actions(dp, skb, key, actions, + nla_len(actions)); + else + ovs_dp_process_packet(skb, key); + } while (!action_fifo_is_empty(fifo)); + + /* Reset FIFO for the next packet. */ + action_fifo_init(fifo); +} + /* Execute a list of actions against 'skb'. */ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key) { - struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + int level = this_cpu_read(exec_actions_level); + struct sw_flow_actions *acts; + int err; + + acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + + this_cpu_inc(exec_actions_level); + err = do_execute_actions(dp, skb, key, + acts->actions, acts->actions_len); + + if (!level) + process_deferred_actions(dp); + + this_cpu_dec(exec_actions_level); + return err; +} + +int action_fifos_init(void) +{ + action_fifos = alloc_percpu(struct action_fifo); + if (!action_fifos) + return -ENOMEM; - return do_execute_actions(dp, skb, key, - acts->actions, acts->actions_len); + return 0; +} + +void action_fifos_exit(void) +{ + free_percpu(action_fifos); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 7e0819919b8a..16cad14fa81e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -156,7 +156,7 @@ static struct datapath *get_dp(struct net *net, int dp_ifindex) } /* Must be called with rcu_read_lock or ovs_mutex. */ -static const char *ovs_dp_name(const struct datapath *dp) +const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); return vport->ops->get_name(vport); @@ -2065,10 +2065,14 @@ static int __init dp_init(void) pr_info("Open vSwitch switching datapath\n"); - err = ovs_internal_dev_rtnl_link_register(); + err = action_fifos_init(); if (err) goto error; + err = ovs_internal_dev_rtnl_link_register(); + if (err) + goto error_action_fifos_exit; + err = ovs_flow_init(); if (err) goto error_unreg_rtnl_link; @@ -2101,6 +2105,8 @@ error_flow_exit: ovs_flow_exit(); error_unreg_rtnl_link: ovs_internal_dev_rtnl_link_unregister(); +error_action_fifos_exit: + action_fifos_exit(); error: return err; } @@ -2114,6 +2120,7 @@ static void dp_cleanup(void) ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); + action_fifos_exit(); } module_init(dp_init); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 25b0e888cb27..ac3f3df96961 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -189,13 +189,18 @@ void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); +const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *); + void ovs_dp_notify_wq(struct work_struct *work); +int action_fifos_init(void); +void action_fifos_exit(void); + #define OVS_NLERR(fmt, ...) \ do { \ if (net_ratelimit()) \ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index bf8442071d75..4010423f2831 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -606,6 +606,11 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return 0; } +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) +{ + return key_extract(skb, key); +} + int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, struct sk_buff *skb, struct sw_flow_key *key) { diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 3869a540365c..0f5db4ec565d 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -72,6 +72,8 @@ struct sw_flow_key { u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ } __packed phy; /* Safe when right after 'tun_key'. */ + u32 ovs_flow_hash; /* Datapath computed hash value. */ + u32 recirc_id; /* Recirculation ID. */ struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ @@ -187,6 +189,7 @@ void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 630b320fbf3e..f4c8daa73965 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -251,6 +251,8 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), + [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32), + [OVS_KEY_ATTR_DP_HASH] = sizeof(u32), [OVS_KEY_ATTR_TUNNEL] = -1, }; @@ -454,6 +456,20 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask) { + if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { + u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); + + SW_FLOW_KEY_PUT(match, ovs_flow_hash, hash_val, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_DP_HASH); + } + + if (*attrs & (1 << OVS_KEY_ATTR_RECIRC_ID)) { + u32 recirc_id = nla_get_u32(a[OVS_KEY_ATTR_RECIRC_ID]); + + SW_FLOW_KEY_PUT(match, recirc_id, recirc_id, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_RECIRC_ID); + } + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { SW_FLOW_KEY_PUT(match, phy.priority, nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); @@ -873,6 +889,12 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, struct nlattr *nla, *encap; bool is_mask = (swkey != output); + if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash)) + goto nla_put_failure; + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; @@ -1401,11 +1423,13 @@ int ovs_nla_copy_actions(const struct nlattr *attr, /* Expected argument lengths, (u32)-1 for variable length. */ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, - [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -1432,6 +1456,18 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_HASH: { + const struct ovs_action_hash *act_hash = nla_data(a); + + switch (act_hash->hash_alg) { + case OVS_HASH_ALG_L4: + break; + default: + return -EINVAL; + } + + break; + } case OVS_ACTION_ATTR_POP_VLAN: break; @@ -1444,6 +1480,9 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_RECIRC: + break; + case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, &skip_copy); if (err) -- cgit v1.2.3 From 44b75e43178276f57141c314661526496e85a3ab Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 15 Sep 2014 16:43:42 -0700 Subject: net_sched: fix memory leak in cls_tcindex Fixes: commit 331b72922c5f58d48fd ("net: sched: RCU cls_tcindex") Cc: John Fastabend Signed-off-by: Cong Wang Acked-By: John Fastabend Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index dd2d691f0bbb..ee525426b3d4 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -242,8 +242,10 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, * perfect hash and hash pointers from old data. */ cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) - return -ENOMEM; + if (!cp) { + err = -ENOMEM; + goto errout; + } cp->mask = p->mask; cp->shift = p->shift; @@ -257,6 +259,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, sizeof(*r) * cp->hash, GFP_KERNEL); if (!cp->perfect) goto errout; + balloc = 1; } cp->h = p->h; @@ -282,9 +285,9 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (cp->perfect) { if (!valid_perfect_hash(cp) || cp->hash > cp->alloc_hash) - goto errout; + goto errout_alloc; } else if (cp->h && cp->hash != cp->alloc_hash) { - goto errout; + goto errout_alloc; } err = -EINVAL; @@ -311,7 +314,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, */ if (cp->perfect || valid_perfect_hash(cp)) if (handle >= cp->alloc_hash) - goto errout; + goto errout_alloc; err = -ENOMEM; @@ -321,7 +324,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL); if (!cp->perfect) - goto errout; + goto errout_alloc; for (i = 0; i < cp->hash; i++) tcf_exts_init(&cp->perfect[i].exts, TCA_TCINDEX_ACT, @@ -335,7 +338,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, GFP_KERNEL); if (!hash) - goto errout; + goto errout_alloc; cp->h = hash; balloc = 2; -- cgit v1.2.3 From 69301eaa7fdab984c55c227359905fd522dbfc9c Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 15 Sep 2014 16:43:43 -0700 Subject: net_sched: fix a null pointer dereference in tcindex_set_parms() This patch fixes the following crash: [ 42.199159] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 [ 42.200027] IP: [] tcindex_set_parms+0x45c/0x526 [ 42.200027] PGD d2319067 PUD d4ffe067 PMD 0 [ 42.200027] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 42.200027] CPU: 0 PID: 541 Comm: tc Not tainted 3.17.0-rc4+ #603 [ 42.200027] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 42.200027] task: ffff8800d22d2670 ti: ffff8800ce790000 task.ti: ffff8800ce790000 [ 42.200027] RIP: 0010:[] [] tcindex_set_parms+0x45c/0x526 [ 42.200027] RSP: 0018:ffff8800ce793898 EFLAGS: 00010202 [ 42.200027] RAX: 0000000000000001 RBX: ffff8800d1786498 RCX: 0000000000000000 [ 42.200027] RDX: ffffffff82114ec8 RSI: ffffffff82114ec8 RDI: ffffffff82114ec8 [ 42.200027] RBP: ffff8800ce793958 R08: 00000000000080d0 R09: 0000000000000001 [ 42.200027] R10: ffff8800ce7939a0 R11: 0000000000000246 R12: ffff8800d017d238 [ 42.200027] R13: 0000000000000018 R14: ffff8800d017c6a0 R15: ffff8800d1786620 [ 42.200027] FS: 00007f4e24539740(0000) GS:ffff88011a600000(0000) knlGS:0000000000000000 [ 42.200027] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 42.200027] CR2: 0000000000000018 CR3: 00000000cff38000 CR4: 00000000000006f0 [ 42.200027] Stack: [ 42.200027] ffff8800ce0949f0 0000000000000000 0000000200000003 ffff880000000000 [ 42.200027] ffff8800ce7938b8 ffff8800ce7938b8 0000000600000007 0000000000000000 [ 42.200027] ffff8800ce7938d8 ffff8800ce7938d8 0000000600000007 ffff8800ce0949f0 [ 42.200027] Call Trace: [ 42.200027] [] tcindex_change+0xdb/0xee [ 42.200027] [] tc_ctl_tfilter+0x44d/0x63f [ 42.200027] [] rtnetlink_rcv_msg+0x181/0x194 [ 42.200027] [] ? rtnl_lock+0x17/0x19 [ 42.200027] [] ? __rtnl_unlock+0x17/0x17 [ 42.200027] [] netlink_rcv_skb+0x49/0x8b [ 43.462494] [] rtnetlink_rcv+0x23/0x2a [ 43.462494] [] netlink_unicast+0xc7/0x148 [ 43.462494] [] netlink_sendmsg+0x5cb/0x63d [ 43.462494] [] ? mark_lock+0x2e/0x224 [ 43.462494] [] __sock_sendmsg_nosec+0x25/0x27 [ 43.462494] [] sock_sendmsg+0x57/0x71 [ 43.462494] [] ? might_fault+0x57/0xa4 [ 43.462494] [] ? might_fault+0xa0/0xa4 [ 43.462494] [] ? might_fault+0x57/0xa4 [ 43.462494] [] ? verify_iovec+0x69/0xb7 [ 43.462494] [] ___sys_sendmsg+0x21d/0x2bb [ 43.462494] [] ? native_sched_clock+0x35/0x37 [ 43.462494] [] ? sched_clock_local+0x12/0x72 [ 43.462494] [] ? mark_lock+0x2e/0x224 [ 43.462494] [] ? sched_clock_cpu+0xa0/0xb9 [ 43.462494] [] ? __lock_acquire+0x5fe/0xde4 [ 43.462494] [] ? rcu_read_lock_held+0x36/0x38 [ 43.462494] [] ? __fcheck_files.isra.7+0x4b/0x57 [ 43.462494] [] ? __fget_light+0x30/0x54 [ 43.462494] [] __sys_sendmsg+0x42/0x60 [ 43.462494] [] SyS_sendmsg+0x12/0x1c [ 43.462494] [] system_call_fastpath+0x16/0x1b 'p->h' could be NULL while 'cp->h' is always update to date. Fixes: commit 331b72922c5f58d48fd ("net: sched: RCU cls_tcindex") Cc: John Fastabend Signed-off-by: Cong Wang Acked-By: John Fastabend Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index ee525426b3d4..5054fae33a48 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -381,7 +381,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, f->result = new_filter_result; f->next = NULL; - fp = p->h + (handle % p->hash); + fp = cp->h + (handle % cp->hash); for (nfp = rtnl_dereference(*fp); nfp; fp = &nfp->next, nfp = rtnl_dereference(*fp)) -- cgit v1.2.3 From 80aab73de4a076fc70ad5cc60395d935c40e605d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 15 Sep 2014 23:30:26 -0700 Subject: net: sched: fix unsued cpu variable kbuild test robot reported an unused variable cpu in cls_u32.c after the patch below. This happens when PERF and MARK config variables are disabled Fix this is to use separate variables for perf and mark and define the cpu variable inside the ifdef logic. Fixes: 459d5f626da7 ("net: sched: make cls_u32 per cpu")' Signed-off-by: John Fastabend Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 5ed5ac4361b1..8cffe5a27007 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -788,8 +788,8 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, } else { #ifdef CONFIG_CLS_U32_PERF struct tc_u32_pcnt *gpf; -#endif int cpu; +#endif if (nla_put(skb, TCA_U32_SEL, sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), @@ -816,9 +816,10 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct tc_u32_mark mark = {.val = n->val, .mask = n->mask, .success = 0}; + int cpum; - for_each_possible_cpu(cpu) { - __u32 cnt = *per_cpu_ptr(n->pcpu_success, cpu); + for_each_possible_cpu(cpum) { + __u32 cnt = *per_cpu_ptr(n->pcpu_success, cpum); mark.success += cnt; } -- cgit v1.2.3 From a96366bf263919c529baa74a0b029c82a8388045 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 15 Sep 2014 23:30:49 -0700 Subject: net: sched: cls_u32 add missing rcu_assign_pointer and annotation Add missing rcu_assign_pointer and missing annotation for ht_up in cls_u32.c Caught by kbuild bot, >> net/sched/cls_u32.c:378:36: sparse: incorrect type in initializer (different address spaces) net/sched/cls_u32.c:378:36: expected struct tc_u_hnode *ht net/sched/cls_u32.c:378:36: got struct tc_u_hnode [noderef] *ht_up >> net/sched/cls_u32.c:610:54: sparse: incorrect type in argument 4 (different address spaces) net/sched/cls_u32.c:610:54: expected struct tc_u_hnode *ht net/sched/cls_u32.c:610:54: got struct tc_u_hnode [noderef] *ht_up >> net/sched/cls_u32.c:684:18: sparse: incorrect type in assignment (different address spaces) net/sched/cls_u32.c:684:18: expected struct tc_u_hnode [noderef] *ht_up net/sched/cls_u32.c:684:18: got struct tc_u_hnode *[assigned] ht >> net/sched/cls_u32.c:359:18: sparse: dereference of noderef expression Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 8cffe5a27007..eceeb0456d26 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -375,7 +375,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { struct tc_u_knode __rcu **kp; struct tc_u_knode *pkp; - struct tc_u_hnode *ht = key->ht_up; + struct tc_u_hnode *ht = rtnl_dereference(key->ht_up); if (ht) { kp = &ht->ht[TC_U32_HASH(key->handle)]; @@ -607,7 +607,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (TC_U32_KEY(n->handle) == 0) return -EINVAL; - return u32_set_parms(net, tp, base, n->ht_up, n, tb, + return u32_set_parms(net, tp, base, + rtnl_dereference(n->ht_up), n, tb, tca[TCA_RATE], ovr); } @@ -681,7 +682,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, #endif memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); - n->ht_up = ht; + RCU_INIT_POINTER(n->ht_up, ht); n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); -- cgit v1.2.3 From d14cbfc88ff87e5054d67fde3ba5f4c20b773dab Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 15 Sep 2014 23:31:17 -0700 Subject: net: sched: cls_cgroup fix possible memory leak of 'new' tree: git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master head: 54996b529ab70ca1d6f40677cd2698c4f7127e87 commit: c7953ef23042b7c4fc2be5ecdd216aacff6df5eb [625/646] net: sched: cls_cgroup use RCU net/sched/cls_cgroup.c:130 cls_cgroup_change() warn: possible memory leak of 'new' net/sched/cls_cgroup.c:135 cls_cgroup_change() warn: possible memory leak of 'new' net/sched/cls_cgroup.c:139 cls_cgroup_change() warn: possible memory leak of 'new' Fixes: c7953ef23042b7c4fc2be5ecdd216aac ("net: sched: cls_cgroup use RCU") Reported-by: Dan Carpenter Signed-off-by: John Fastabend Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_cgroup.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 3b7548759998..10c7ffde13e2 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -127,16 +127,18 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], cgroup_policy); if (err < 0) - return err; + goto errout; tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) - return err; + goto errout; err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t); - if (err < 0) - return err; + if (err < 0) { + tcf_exts_destroy(tp, &e); + goto errout; + } tcf_exts_change(tp, &new->exts, &e); tcf_em_tree_change(tp, &new->ematches, &t); @@ -145,6 +147,9 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (head) call_rcu(&head->rcu, cls_cgroup_destroy_rcu); return 0; +errout: + kfree(new); + return err; } static void cls_cgroup_destroy(struct tcf_proto *tp) -- cgit v1.2.3 From e1f93eb06c3a13b29b1980f27dada960503cd49e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 15 Sep 2014 23:31:42 -0700 Subject: net: sched: cls_fw: add missing tcf_exts_init call in fw_change() When allocating a new structure we also need to call tcf_exts_init to initialize exts. A follow up patch might be in order to remove some of this code and do tcf_exts_assign(). With this we could remove the tcf_exts_init/tcf_exts_change pattern for some of the classifiers. As part of the future tcf_actions RCU series this will need to be done. For now fix the call here. Fixes e35a8ee5993ba81fd6c0 ("net: sched: fw use RCU") Signed-off-by: John Fastabend Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_fw.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 006b45a67fdd..2650285620ee 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -264,6 +264,8 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, #endif /* CONFIG_NET_CLS_IND */ fnew->tp = f->tp; + tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE); + err = fw_change_attrs(net, tp, fnew, tb, tca, base, ovr); if (err < 0) { kfree(fnew); -- cgit v1.2.3 From 9f6c38e70b6c7ea379394a755fe76e09996f5370 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Sep 2014 00:33:42 -0700 Subject: net: sched: cls_cgroup need tcf_exts_init in all cases This ensures the tcf_exts_init() is called for all cases. Fixes: 952313bd62589cae216a57 ("net: sched: cls_cgroup use RCU") Signed-off-by: John Fastabend Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_cgroup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 10c7ffde13e2..15c34d4ccd9e 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -116,12 +116,11 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (!new) return -ENOBUFS; - if (head) { + tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + if (head) new->handle = head->handle; - } else { - tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + else new->handle = handle; - } new->tp = tp; err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], -- cgit v1.2.3 From 0097db06f5ab2df1756bc4cbf4395593024d87a1 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 16 Sep 2014 21:36:09 +0200 Subject: Bluetooth: Remove exported hci_recv_fragment function The hci_recv_fragment function is no longer used by any driver and thus do not export it. In fact it is not even needed by the core and it can be removed altogether. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 1 - net/bluetooth/hci_core.c | 20 -------------------- 2 files changed, 21 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 206b92bfeebb..37ff1aef0845 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -926,7 +926,6 @@ int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr); void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb); int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb); -int hci_recv_fragment(struct hci_dev *hdev, int type, void *data, int count); int hci_recv_stream_fragment(struct hci_dev *hdev, void *data, int count); void hci_init_sysfs(struct hci_dev *hdev); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 41948678f514..cb05d7f16a34 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4374,26 +4374,6 @@ static int hci_reassembly(struct hci_dev *hdev, int type, void *data, return remain; } -int hci_recv_fragment(struct hci_dev *hdev, int type, void *data, int count) -{ - int rem = 0; - - if (type < HCI_ACLDATA_PKT || type > HCI_EVENT_PKT) - return -EILSEQ; - - while (count) { - rem = hci_reassembly(hdev, type, data, count, type - 1); - if (rem < 0) - return rem; - - data += (count - rem); - count = rem; - } - - return rem; -} -EXPORT_SYMBOL(hci_recv_fragment); - #define STREAM_REASSEMBLY 0 int hci_recv_stream_fragment(struct hci_dev *hdev, void *data, int count) -- cgit v1.2.3 From 4d316f3f9ae3d5fad8d3198eec0a4ef2511471d7 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 17 Sep 2014 00:09:00 +0300 Subject: ipvs: use correct address family in scheduler logs Needed to support svc->af != dest->af. Signed-off-by: Julian Anastasov Signed-off-by: Alex Gartrell Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_dh.c | 2 +- net/netfilter/ipvs/ip_vs_fo.c | 2 +- net/netfilter/ipvs/ip_vs_lc.c | 2 +- net/netfilter/ipvs/ip_vs_nq.c | 3 ++- net/netfilter/ipvs/ip_vs_rr.c | 2 +- net/netfilter/ipvs/ip_vs_sed.c | 3 ++- net/netfilter/ipvs/ip_vs_sh.c | 8 ++++---- net/netfilter/ipvs/ip_vs_wlc.c | 3 ++- net/netfilter/ipvs/ip_vs_wrr.c | 2 +- 9 files changed, 15 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index c3b84546ea9e..6be5c538b71e 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -234,7 +234,7 @@ ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c index 6a2647d471ff..e09874d02938 100644 --- a/net/netfilter/ipvs/ip_vs_fo.c +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -44,7 +44,7 @@ ip_vs_fo_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, if (hweight) { IP_VS_DBG_BUF(6, "FO: server %s:%u activeconns %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &hweight->addr), + IP_VS_DBG_ADDR(hweight->af, &hweight->addr), ntohs(hweight->port), atomic_read(&hweight->activeconns), atomic_read(&hweight->weight)); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index 2bdcb1cf2127..19a0769a989a 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -59,7 +59,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, else IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d " "inactconns %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), + IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->inactconns)); diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index 961a6de9bb29..a8b63401e773 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -107,7 +107,8 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, out: IP_VS_DBG_BUF(6, "NQ: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index 176b87c35e34..58bacfc461ee 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -95,7 +95,7 @@ stop: spin_unlock_bh(&svc->sched_lock); IP_VS_DBG_BUF(6, "RR: server %s:%u " "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index e446b9fa7424..f8e2d00f528b 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -108,7 +108,8 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "SED: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index cc65b2f42cd4..98a13433b68c 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -138,7 +138,7 @@ ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, return dest; IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); /* if the original dest is unavailable, loop around the table * starting from ihash to find a new dest @@ -153,7 +153,7 @@ ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, return dest; IP_VS_DBG_BUF(6, "SH: selected unavailable " "server %s:%d (offset %d), reselecting", - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), roffset); } @@ -192,7 +192,7 @@ ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) RCU_INIT_POINTER(b->dest, dest); IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", - i, IP_VS_DBG_ADDR(svc->af, &dest->addr), + i, IP_VS_DBG_ADDR(dest->af, &dest->addr), atomic_read(&dest->weight)); /* Don't move to next dest until filling weight */ @@ -342,7 +342,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->saddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index b5b4650d50a9..6b366fd90554 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -80,7 +80,8 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "WLC: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 0546cd572d6b..17e6d4406ca7 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -216,7 +216,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, found: IP_VS_DBG_BUF(6, "WRR: server %s:%u " "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); -- cgit v1.2.3 From f18ae7206eaebfecc2dd8b017b0d6a0950eabf8b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 9 Sep 2014 16:40:38 -0700 Subject: ipvs: use the new dest addr family field Use the new address family field cp->daf when printing cp->daddr in logs or connection listing. Signed-off-by: Julian Anastasov Signed-off-by: Alex Gartrell Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 49 +++++++++++++++++++++++++++-------- net/netfilter/ipvs/ip_vs_core.c | 6 ++--- net/netfilter/ipvs/ip_vs_proto_sctp.c | 2 +- net/netfilter/ipvs/ip_vs_proto_tcp.c | 2 +- 4 files changed, 43 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 13e9cee02c81..b0f7b626b56d 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -77,6 +78,13 @@ static unsigned int ip_vs_conn_rnd __read_mostly; #define CT_LOCKARRAY_SIZE (1<protocol), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, cp->flags, atomic_read(&cp->refcnt), atomic_read(&dest->refcnt)); @@ -685,7 +693,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) ip_vs_proto_name(cp->protocol), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, cp->flags, atomic_read(&cp->refcnt), atomic_read(&dest->refcnt)); @@ -754,7 +762,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct) ntohs(ct->cport), IP_VS_DBG_ADDR(ct->af, &ct->vaddr), ntohs(ct->vport), - IP_VS_DBG_ADDR(ct->af, &ct->daddr), + IP_VS_DBG_ADDR(ct->daf, &ct->daddr), ntohs(ct->dport)); /* @@ -1051,6 +1059,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) struct net *net = seq_file_net(seq); char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; + char dbuf[IP_VS_ADDRSTRLEN]; if (!ip_vs_conn_net_eq(cp, net)) return 0; @@ -1064,25 +1073,33 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) } pe_data[len] = '\0'; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); + else +#endif + snprintf(dbuf, sizeof(dbuf), "%08X", + ntohl(cp->daddr.ip)); + #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " - "%pI6 %04X %-11s %7lu%s\n", + "%s %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), - &cp->daddr.in6, ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), (cp->timer.expires-jiffies)/HZ, pe_data); else #endif seq_printf(seq, "%-3s %08X %04X %08X %04X" - " %08X %04X %-11s %7lu%s\n", + " %s %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), (cp->timer.expires-jiffies)/HZ, pe_data); } @@ -1120,6 +1137,7 @@ static const char *ip_vs_origin_name(unsigned int flags) static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) { + char dbuf[IP_VS_ADDRSTRLEN]; if (v == SEQ_START_TOKEN) seq_puts(seq, @@ -1131,13 +1149,22 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) if (!ip_vs_conn_net_eq(cp, net)) return 0; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); + else +#endif + snprintf(dbuf, sizeof(dbuf), "%08X", + ntohl(cp->daddr.ip)); + #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) - seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n", + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " + "%s %04X %-11s %-6s %7lu\n", ip_vs_proto_name(cp->protocol), &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), - &cp->daddr.in6, ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); @@ -1145,11 +1172,11 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) #endif seq_printf(seq, "%-3s %08X %04X %08X %04X " - "%08X %04X %-11s %-6s %7lu\n", + "%s %04X %-11s %-6s %7lu\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 1f6ecb74821f..990decba1fe4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -492,9 +492,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), - IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), cp->flags, atomic_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 2f7ea7564044..5b84c0b56642 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -432,7 +432,7 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, pd->pp->name, ((direction == IP_VS_DIR_OUTPUT) ? "output " : "input "), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index e3a697234a98..8e92beb0cca9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -510,7 +510,7 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', - IP_VS_DBG_ADDR(cp->af, &cp->daddr), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), -- cgit v1.2.3 From bc18d37f676f76edbb5e0c37def78c704b5fbed0 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:39 -0700 Subject: ipvs: Allow heterogeneous pools now that we support them Remove the temporary consistency check and add a case statement to only allow ipip mixed dests. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 462760eded94..ac7ba689efe7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -854,10 +854,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, EnterFunction(2); - /* Temporary for consistency */ - if (udest->af != svc->af) - return -EINVAL; - #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); @@ -3403,6 +3399,26 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) */ if (udest.af == 0) udest.af = svc->af; + + if (udest.af != svc->af) { + /* The synchronization protocol is incompatible + * with mixed family services + */ + if (net_ipvs(net)->sync_state) { + ret = -EINVAL; + goto out; + } + + /* Which connection types do we support? */ + switch (udest.conn_flags) { + case IP_VS_CONN_F_TUNNEL: + /* We are able to forward this */ + break; + default: + ret = -EINVAL; + goto out; + } + } } switch (cmd) { -- cgit v1.2.3 From 689f1c9de2abbd76fda224d12cea5f43568a4335 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 18 Sep 2014 16:38:18 +0800 Subject: ipsec: Remove obsolete MAX_AH_AUTH_LEN While tracking down the MAX_AH_AUTH_LEN crash in an old kernel I thought that this limit was rather arbitrary and we should just get rid of it. In fact it seems that we've already done all the work needed to remove it apart from actually removing it. This limit was there in order to limit stack usage. Since we've already switched over to allocating scratch space using kmalloc, there is no longer any need to limit the authentication length. This patch kills all references to it, including the BUG_ONs that led me here. Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- include/net/ah.h | 3 --- net/ipv4/ah4.c | 2 -- net/ipv6/ah6.c | 2 -- net/xfrm/xfrm_user.c | 3 +-- 4 files changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/ah.h b/include/net/ah.h index ca95b98969dd..4e2dfa474a7e 100644 --- a/include/net/ah.h +++ b/include/net/ah.h @@ -3,9 +3,6 @@ #include -/* This is the maximum truncated ICV length that we know of. */ -#define MAX_AH_AUTH_LEN 64 - struct crypto_ahash; struct ah_data { diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index a2afa89513a0..ac9a32ec3ee4 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -505,8 +505,6 @@ static int ah_init_state(struct xfrm_state *x) ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; - BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - if (x->props.flags & XFRM_STATE_ALIGN4) x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index fcffd4e522c8..6d16eb0e0c7f 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -713,8 +713,6 @@ static int ah6_init_state(struct xfrm_state *x) ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; - BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); switch (x->props.mode) { diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index eaf8a8f1cbe8..e812e988c111 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -333,8 +333,7 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, algo = xfrm_aalg_get_byname(ualg->alg_name, 1); if (!algo) return -ENOSYS; - if ((ualg->alg_trunc_len / 8) > MAX_AH_AUTH_LEN || - ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits) + if (ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits) return -EINVAL; *props = algo->desc.sadb_alg_id; -- cgit v1.2.3 From 5eb596f55cacc2389554a8d7572d90d5e9d4269d Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 18 Sep 2014 11:26:32 +0300 Subject: Bluetooth: Fix setting correct security level when initiating SMP We can only determine the final security level when both pairing request and response have been exchanged. When initiating pairing the starting target security level is set to MEDIUM unless explicitly specified to be HIGH, so that we can still perform pairing even if the remote doesn't have MITM capabilities. However, once we've received the pairing response we should re-consult the remote and local IO capabilities and upgrade the target security level if necessary. Without this patch the resulting Long Term Key will occasionally be reported to be unauthenticated when it in reality is an authenticated one. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 51fc7db2d84e..f09b6b65cf6b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -494,8 +494,11 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, } /* Not Just Works/Confirm results in MITM Authentication */ - if (method != JUST_CFM) + if (method != JUST_CFM) { set_bit(SMP_FLAG_MITM_AUTH, &smp->flags); + if (hcon->pending_sec_level < BT_SECURITY_HIGH) + hcon->pending_sec_level = BT_SECURITY_HIGH; + } /* If both devices have Keyoard-Display I/O, the master * Confirms and the slave Enters the passkey. -- cgit v1.2.3 From fc04733a1a71af26bf30830571b71f5f2a354a06 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Sep 2014 14:53:17 +0200 Subject: netfilter: nfnetlink: use original skbuff when committing/aborting This allows us to access the original content of the batch from the commit and the abort paths. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index c138b8fbe280..f77d3f7f22b5 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -333,7 +333,7 @@ replay: * original skb. */ if (err == -EAGAIN) { - ss->abort(skb); + ss->abort(oskb); nfnl_unlock(subsys_id); kfree_skb(nskb); goto replay; @@ -357,9 +357,9 @@ ack: } done: if (success && done) - ss->commit(skb); + ss->commit(oskb); else - ss->abort(skb); + ss->abort(oskb); nfnl_unlock(subsys_id); kfree_skb(nskb); -- cgit v1.2.3 From 84d7fce693884897c6196cc98228a2ad56ae2a9a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 4 Sep 2014 14:30:22 +0200 Subject: netfilter: nf_tables: export rule-set generation ID This patch exposes the ruleset generation ID in three ways: 1) The new command NFT_MSG_GETGEN that exposes the 32-bits ruleset generation ID. This ID is incremented in every commit and it should be large enough to avoid wraparound problems. 2) The less significant 16-bits of the generation ID are exposed through the nfgenmsg->res_id header field. This allows us to quickly catch if the ruleset has change between two consecutive list dumps from different object lists (in this specific case I think the risk of wraparound is unlikely). 3) Userspace subscribers may receive notifications of new rule-set generation after every commit. This also provides an alternative way to monitor the generation ID. If the events are lost, the userspace process hits a overrun error, so it knows that it is working with a stale ruleset anyway. Patrick spotted that rule-set transformations in userspace may take quite some time. In that case, it annotates the 32-bits generation ID before fetching the rule-set, then: 1) it compares it to what we obtain after the transformation to make sure it is not working with a stale rule-set and no wraparound has ocurred. 2) it subscribes to ruleset notifications, so it can watch for new generation ID. This is complementary to the NLM_F_DUMP_INTR approach, which allows us to detect an interference in the middle one single list dumping. There is no way to explicitly check that an interference has occurred between two list dumps from the kernel, since it doesn't know how many lists the userspace client is actually going to dump. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 16 ++++ net/netfilter/nf_tables_api.c | 140 +++++++++++++++++++++++++------ 2 files changed, 130 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 66d66dd3ff79..b72ccfeaf865 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -51,6 +51,8 @@ enum nft_verdicts { * @NFT_MSG_NEWSETELEM: create a new set element (enum nft_set_elem_attributes) * @NFT_MSG_GETSETELEM: get a set element (enum nft_set_elem_attributes) * @NFT_MSG_DELSETELEM: delete a set element (enum nft_set_elem_attributes) + * @NFT_MSG_NEWGEN: announce a new generation, only for events (enum nft_gen_attributes) + * @NFT_MSG_GETGEN: get the rule-set generation (enum nft_gen_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -68,6 +70,8 @@ enum nf_tables_msg_types { NFT_MSG_NEWSETELEM, NFT_MSG_GETSETELEM, NFT_MSG_DELSETELEM, + NFT_MSG_NEWGEN, + NFT_MSG_GETGEN, NFT_MSG_MAX, }; @@ -812,4 +816,16 @@ enum nft_masq_attributes { }; #define NFTA_MASQ_MAX (__NFTA_MASQ_MAX - 1) +/** + * enum nft_gen_attributes - nf_tables ruleset generation attributes + * + * @NFTA_GEN_ID: Ruleset generation ID (NLA_U32) + */ +enum nft_gen_attributes { + NFTA_GEN_UNSPEC, + NFTA_GEN_ID, + __NFTA_GEN_MAX +}; +#define NFTA_GEN_MAX (__NFTA_GEN_MAX - 1) + #endif /* _LINUX_NF_TABLES_H */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 82374601577e..a476b9962155 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -405,9 +405,9 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, }; -static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, - const struct nft_table *table) +static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -420,7 +420,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || @@ -448,8 +448,8 @@ static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; - err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table); + err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; @@ -488,7 +488,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_table_info(skb, + if (nf_tables_fill_table_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, @@ -540,7 +540,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, family, table); if (err < 0) @@ -914,9 +914,9 @@ nla_put_failure: return -ENOSPC; } -static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, - const struct nft_table *table, +static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table, const struct nft_chain *chain) { struct nlmsghdr *nlh; @@ -930,7 +930,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) goto nla_put_failure; @@ -988,8 +988,8 @@ static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; - err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table, + err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table, ctx->chain); if (err < 0) { kfree_skb(skb); @@ -1031,7 +1031,8 @@ static int nf_tables_dump_chains(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid, + if (nf_tables_fill_chain_info(skb, net, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, NLM_F_MULTI, @@ -1090,7 +1091,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, family, table, chain); if (err < 0) @@ -1647,8 +1648,9 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, }; -static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, +static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, + u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, const struct nft_rule *rule) @@ -1668,7 +1670,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) goto nla_put_failure; @@ -1724,8 +1726,8 @@ static int nf_tables_rule_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; - err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table, + err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table, ctx->chain, rule); if (err < 0) { kfree_skb(skb); @@ -1771,7 +1773,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid, + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, @@ -1837,7 +1839,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule); if (err < 0) @@ -2321,7 +2323,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx->afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; @@ -2925,7 +2927,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx.afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx.net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) goto nla_put_failure; @@ -3006,7 +3008,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx->afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; @@ -3293,6 +3295,87 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, return err; } +static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWGEN; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); + + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq))) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -EMSGSIZE; +} + +static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) +{ + struct nlmsghdr *nlh = nlmsg_hdr(skb); + struct sk_buff *skb2; + int err; + + if (nlmsg_report(nlh) && + !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb2 == NULL) + goto err; + + err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq); + if (err < 0) { + kfree_skb(skb2); + goto err; + } + + err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, + NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + +static int nf_tables_getgen(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + struct net *net = sock_net(skb->sk); + struct sk_buff *skb2; + int err; + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); +err: + kfree_skb(skb2); + return err; +} + static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { .call_batch = nf_tables_newtable, @@ -3369,6 +3452,9 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_GETGEN] = { + .call = nf_tables_getgen, + }, }; static void nft_chain_commit_update(struct nft_trans *trans) @@ -3526,6 +3612,8 @@ static int nf_tables_commit(struct sk_buff *skb) call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); } + nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + return 0; } -- cgit v1.2.3 From fd384412e199b62c3ddaabd18dce86d0e164c5b9 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Tue, 16 Sep 2014 17:31:16 -0700 Subject: udp_tunnel: Seperate ipv6 functions into its own file. Add ip6_udp_tunnel.c for ipv6 UDP tunnel functions to avoid ifdefs in udp_tunnel.c Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/net/udp_tunnel.h | 28 ++++++++++++++-- net/ipv4/udp_tunnel.c | 85 ++++++++++++----------------------------------- net/ipv6/Makefile | 1 + net/ipv6/ip6_udp_tunnel.c | 63 +++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 66 deletions(-) create mode 100644 net/ipv6/ip6_udp_tunnel.c (limited to 'net') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index ffd69cbded35..0b9e017c9038 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -26,7 +26,31 @@ struct udp_port_cfg { use_udp6_rx_checksums:1; }; -int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp); +int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp); + +#if IS_ENABLED(CONFIG_IPV6) +int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp); +#else +static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) +{ + return 0; +} +#endif + +static inline int udp_sock_create(struct net *net, + struct udp_port_cfg *cfg, + struct socket **sockp) +{ + if (cfg->family == AF_INET) + return udp_sock_create4(net, cfg, sockp); + + if (cfg->family == AF_INET6) + return udp_sock_create6(net, cfg, sockp); + + return -EPFNOSUPPORT; +} #endif diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 61ec1a65207e..7fccf6c3417a 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -8,83 +8,40 @@ #include #include -int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp) +int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) { int err = -EINVAL; struct socket *sock = NULL; + struct sockaddr_in udp_addr; -#if IS_ENABLED(CONFIG_IPV6) - if (cfg->family == AF_INET6) { - struct sockaddr_in6 udp6_addr; + err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); + if (err < 0) + goto error; - err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); - if (err < 0) - goto error; - - sk_change_net(sock->sk, net); - - udp6_addr.sin6_family = AF_INET6; - memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, - sizeof(udp6_addr.sin6_addr)); - udp6_addr.sin6_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, - sizeof(udp6_addr)); - if (err < 0) - goto error; - - if (cfg->peer_udp_port) { - udp6_addr.sin6_family = AF_INET6; - memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, - sizeof(udp6_addr.sin6_addr)); - udp6_addr.sin6_port = cfg->peer_udp_port; - err = kernel_connect(sock, - (struct sockaddr *)&udp6_addr, - sizeof(udp6_addr), 0); - } - if (err < 0) - goto error; - - udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); - udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); - } else -#endif - if (cfg->family == AF_INET) { - struct sockaddr_in udp_addr; - - err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); - if (err < 0) - goto error; + sk_change_net(sock->sk, net); - sk_change_net(sock->sk, net); + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->local_ip; + udp_addr.sin_port = cfg->local_udp_port; + err = kernel_bind(sock, (struct sockaddr *)&udp_addr, + sizeof(udp_addr)); + if (err < 0) + goto error; + if (cfg->peer_udp_port) { udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->local_ip; - udp_addr.sin_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp_addr, - sizeof(udp_addr)); + udp_addr.sin_addr = cfg->peer_ip; + udp_addr.sin_port = cfg->peer_udp_port; + err = kernel_connect(sock, (struct sockaddr *)&udp_addr, + sizeof(udp_addr), 0); if (err < 0) goto error; - - if (cfg->peer_udp_port) { - udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->peer_ip; - udp_addr.sin_port = cfg->peer_udp_port; - err = kernel_connect(sock, - (struct sockaddr *)&udp_addr, - sizeof(udp_addr), 0); - if (err < 0) - goto error; - } - - sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; - } else { - return -EPFNOSUPPORT; } + sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; *sockp = sock; - return 0; error: @@ -95,6 +52,6 @@ error: *sockp = NULL; return err; } -EXPORT_SYMBOL(udp_sock_create); +EXPORT_SYMBOL(udp_sock_create4); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 2fe68364bb20..45f830e5f820 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_IPV6_MIP6) += mip6.o obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o obj-$(CONFIG_IPV6_VTI) += ip6_vti.o obj-$(CONFIG_IPV6_SIT) += sit.o diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c new file mode 100644 index 000000000000..bcfbb4b496fc --- /dev/null +++ b/net/ipv6/ip6_udp_tunnel.c @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) +{ + struct sockaddr_in6 udp6_addr; + int err; + struct socket *sock = NULL; + + err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); + if (err < 0) + goto error; + + sk_change_net(sock->sk, net); + + udp6_addr.sin6_family = AF_INET6; + memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, + sizeof(udp6_addr.sin6_addr)); + udp6_addr.sin6_port = cfg->local_udp_port; + err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, + sizeof(udp6_addr)); + if (err < 0) + goto error; + + if (cfg->peer_udp_port) { + udp6_addr.sin6_family = AF_INET6; + memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, + sizeof(udp6_addr.sin6_addr)); + udp6_addr.sin6_port = cfg->peer_udp_port; + err = kernel_connect(sock, + (struct sockaddr *)&udp6_addr, + sizeof(udp6_addr), 0); + } + if (err < 0) + goto error; + + udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); + udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); + + *sockp = sock; + return 0; + +error: + if (sock) { + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); + } + *sockp = NULL; + return err; +} +EXPORT_SYMBOL_GPL(udp_sock_create6); -- cgit v1.2.3 From 6a93cc9052748c6355ec9d5b6c38b77f85f1cb0d Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Tue, 16 Sep 2014 17:31:17 -0700 Subject: udp-tunnel: Add a few more UDP tunnel APIs Added a few more UDP tunnel APIs that can be shared by UDP based tunnel protocol implementation. The main ones are highlighted below. setup_udp_tunnel_sock() configures UDP listener socket for receiving UDP encapsulated packets. udp_tunnel_xmit_skb() and upd_tunnel6_xmit_skb() transmit skb using UDP encapsulation. udp_tunnel_sock_release() closes the UDP tunnel listener socket. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/net/udp_tunnel.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/udp_tunnel.c | 53 ++++++++++++++++++++++++++++++++++++++++++- net/ipv6/ip6_udp_tunnel.c | 42 ++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 0b9e017c9038..a47790bcaa38 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -1,6 +1,14 @@ #ifndef __NET_UDP_TUNNEL_H #define __NET_UDP_TUNNEL_H +#include +#include + +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#endif + struct udp_port_cfg { u8 family; @@ -53,4 +61,53 @@ static inline int udp_sock_create(struct net *net, return -EPFNOSUPPORT; } +typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); +typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); + +struct udp_tunnel_sock_cfg { + void *sk_user_data; /* user data used by encap_rcv call back */ + /* Used for setting up udp_sock fields, see udp.h for details */ + __u8 encap_type; + udp_tunnel_encap_rcv_t encap_rcv; + udp_tunnel_encap_destroy_t encap_destroy; +}; + +/* Setup the given (UDP) sock to receive UDP encapsulated packets */ +void setup_udp_tunnel_sock(struct net *net, struct socket *sock, + struct udp_tunnel_sock_cfg *sock_cfg); + +/* Transmit the skb using UDP encapsulation. */ +int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, + __u8 tos, __u8 ttl, __be16 df, __be16 src_port, + __be16 dst_port, bool xnet); + +#if IS_ENABLED(CONFIG_IPV6) +int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst, + struct sk_buff *skb, struct net_device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be16 src_port, + __be16 dst_port); +#endif + +void udp_tunnel_sock_release(struct socket *sock); + +static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, + bool udp_csum) +{ + int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + + return iptunnel_handle_offloads(skb, udp_csum, type); +} + +static inline void udp_tunnel_encap_enable(struct socket *sock) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sock->sk->sk_family == PF_INET6) + ipv6_stub->udpv6_encap_enable(); + else +#endif + udp_encap_enable(); +} + #endif diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 7fccf6c3417a..1671263e5fa0 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -11,7 +11,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { - int err = -EINVAL; + int err; struct socket *sock = NULL; struct sockaddr_in udp_addr; @@ -54,4 +54,55 @@ error: } EXPORT_SYMBOL(udp_sock_create4); +void setup_udp_tunnel_sock(struct net *net, struct socket *sock, + struct udp_tunnel_sock_cfg *cfg) +{ + struct sock *sk = sock->sk; + + /* Disable multicast loopback */ + inet_sk(sk)->mc_loop = 0; + + /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ + udp_set_convert_csum(sk, true); + + rcu_assign_sk_user_data(sk, cfg->sk_user_data); + + udp_sk(sk)->encap_type = cfg->encap_type; + udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_destroy = cfg->encap_destroy; + + udp_tunnel_encap_enable(sock); +} +EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); + +int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, + __u8 tos, __u8 ttl, __be16 df, __be16 src_port, + __be16 dst_port, bool xnet) +{ + struct udphdr *uh; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + uh->len = htons(skb->len); + + udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len); + + return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP, + tos, ttl, df, xnet); +} +EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); + +void udp_tunnel_sock_release(struct socket *sock) +{ + rcu_assign_sk_user_data(sock->sk, NULL); + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); +} +EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); + MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index bcfbb4b496fc..cbc990712cc1 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -61,3 +61,45 @@ error: return err; } EXPORT_SYMBOL_GPL(udp_sock_create6); + +int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst, + struct sk_buff *skb, struct net_device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be16 src_port, __be16 dst_port) +{ + struct udphdr *uh; + struct ipv6hdr *ip6h; + struct sock *sk = sock->sk; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + + uh->len = htons(skb->len); + uh->check = 0; + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED + | IPSKB_REROUTED); + skb_dst_set(skb, dst); + + udp6_set_csum(udp_get_no_check6_tx(sk), skb, &inet6_sk(sk)->saddr, + &sk->sk_v6_daddr, skb->len); + + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + + ip6tunnel_xmit(skb, dev); + return 0; +} +EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); -- cgit v1.2.3 From c8fffcea0a079f933b4e98adf9ebaa384dc943b6 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Tue, 16 Sep 2014 17:31:19 -0700 Subject: l2tp: Refactor l2tp core driver to make use of the common UDP tunnel functions Simplify l2tp implementation using common UDP tunnel APIs. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 2aa2b6c15f20..895348e44c7d 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1392,8 +1392,6 @@ static int l2tp_tunnel_sock_create(struct net *net, if (err < 0) goto out; - udp_set_convert_csum(sock->sk, true); - break; case L2TP_ENCAPTYPE_IP: @@ -1584,19 +1582,17 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ tunnel->encap = encap; if (encap == L2TP_ENCAPTYPE_UDP) { - /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ - udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP; - udp_sk(sk)->encap_rcv = l2tp_udp_encap_recv; - udp_sk(sk)->encap_destroy = l2tp_udp_encap_destroy; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == PF_INET6 && !tunnel->v4mapped) - udpv6_encap_enable(); - else -#endif - udp_encap_enable(); - } + struct udp_tunnel_sock_cfg udp_cfg; + + udp_cfg.sk_user_data = tunnel; + udp_cfg.encap_type = UDP_ENCAP_L2TPINUDP; + udp_cfg.encap_rcv = l2tp_udp_encap_recv; + udp_cfg.encap_destroy = l2tp_udp_encap_destroy; - sk->sk_user_data = tunnel; + setup_udp_tunnel_sock(net, sock, &udp_cfg); + } else { + sk->sk_user_data = tunnel; + } /* Hook on the tunnel socket destructor so that we can cleanup * if the tunnel socket goes away. -- cgit v1.2.3 From cb93471acc42b71fa3f2e46805020f2b323db64f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Sep 2014 03:14:42 -0700 Subject: tcp: do not fake tcp headers in tcp_send_rcvq() Now we no longer rely on having tcp headers for skbs in receive queue, tcp repair do not need to build fake ones. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ea92f23ffaf1..02fb66d4a018 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4304,24 +4304,19 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { - struct sk_buff *skb = NULL; - struct tcphdr *th; + struct sk_buff *skb; bool fragstolen; if (size == 0) return 0; - skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); + skb = alloc_skb(size, sk->sk_allocation); if (!skb) goto err; - if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - th = (struct tcphdr *)skb_put(skb, sizeof(*th)); - skb_reset_transport_header(skb); - memset(th, 0, sizeof(*th)); - if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) goto err_free; @@ -4329,7 +4324,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; - if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { + if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { WARN_ON_ONCE(fragstolen); /* should not happen */ __kfree_skb(skb); } -- cgit v1.2.3 From 2e4e44107176d552f8bb1bb76053e850e3809841 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Sep 2014 04:49:49 -0700 Subject: net: add alloc_skb_with_frags() helper Extract from sock_alloc_send_pskb() code building skb with frags, so that we can reuse this in other contexts. Intent is to use it from tcp_send_rcvq(), tcp_collapse(), ... We also want to replace some skb_linearize() calls to a more reliable strategy in pathological cases where we need to reduce number of frags. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++ net/core/skbuff.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/core/sock.c | 78 ++++++++++---------------------------------------- 3 files changed, 99 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 756e3d057e84..f1bfa3781c75 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -769,6 +769,12 @@ static inline struct sk_buff *alloc_skb(unsigned int size, return __alloc_skb(size, priority, 0, NUMA_NO_NODE); } +struct sk_buff *alloc_skb_with_frags(unsigned long header_len, + unsigned long data_len, + int max_page_order, + int *errcode, + gfp_t gfp_mask); + static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 29f7f0121491..06a8feb10099 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4102,3 +4102,81 @@ err_free: return NULL; } EXPORT_SYMBOL(skb_vlan_untag); + +/** + * alloc_skb_with_frags - allocate skb with page frags + * + * header_len: size of linear part + * data_len: needed length in frags + * max_page_order: max page order desired. + * errcode: pointer to error code if any + * gfp_mask: allocation mask + * + * This can be used to allocate a paged skb, given a maximal order for frags. + */ +struct sk_buff *alloc_skb_with_frags(unsigned long header_len, + unsigned long data_len, + int max_page_order, + int *errcode, + gfp_t gfp_mask) +{ + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + unsigned long chunk; + struct sk_buff *skb; + struct page *page; + gfp_t gfp_head; + int i; + + *errcode = -EMSGSIZE; + /* Note this test could be relaxed, if we succeed to allocate + * high order pages... + */ + if (npages > MAX_SKB_FRAGS) + return NULL; + + gfp_head = gfp_mask; + if (gfp_head & __GFP_WAIT) + gfp_head |= __GFP_REPEAT; + + *errcode = -ENOBUFS; + skb = alloc_skb(header_len, gfp_head); + if (!skb) + return NULL; + + skb->truesize += npages << PAGE_SHIFT; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(gfp_mask | + __GFP_COMP | + __GFP_NOWARN | + __GFP_NORETRY, + order); + if (page) + goto fill_page; + /* Do not retry other high order allocations */ + order = 1; + max_page_order = 0; + } + order--; + } + page = alloc_page(gfp_mask); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; + } + return skb; + +failure: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(alloc_skb_with_frags); diff --git a/net/core/sock.c b/net/core/sock.c index 6f436b5e4961..de887c45c63b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1762,21 +1762,12 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { - struct sk_buff *skb = NULL; - unsigned long chunk; - gfp_t gfp_mask; + struct sk_buff *skb; long timeo; int err; - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - struct page *page; - int i; - - err = -EMSGSIZE; - if (npages > MAX_SKB_FRAGS) - goto failure; timeo = sock_sndtimeo(sk, noblock); - while (!skb) { + for (;;) { err = sock_error(sk); if (err != 0) goto failure; @@ -1785,66 +1776,27 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); - continue; - } - - err = -ENOBUFS; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; + if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) + break; - skb = alloc_skb(header_len, gfp_mask); - if (!skb) + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) goto failure; - - skb->truesize += data_len; - - for (i = 0; npages > 0; i++) { - int order = max_page_order; - - while (order) { - if (npages >= 1 << order) { - page = alloc_pages(sk->sk_allocation | - __GFP_COMP | - __GFP_NOWARN | - __GFP_NORETRY, - order); - if (page) - goto fill_page; - /* Do not retry other high order allocations */ - order = 1; - max_page_order = 0; - } - order--; - } - page = alloc_page(sk->sk_allocation); - if (!page) - goto failure; -fill_page: - chunk = min_t(unsigned long, data_len, - PAGE_SIZE << order); - skb_fill_page_desc(skb, i, page, 0, chunk); - data_len -= chunk; - npages -= 1 << order; - } + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); } - - skb_set_owner_w(skb, sk); + skb = alloc_skb_with_frags(header_len, data_len, max_page_order, + errcode, sk->sk_allocation); + if (skb) + skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: - kfree_skb(skb); *errcode = err; return NULL; } -- cgit v1.2.3 From 6819563e646a7f3692836daefd12cd86c697759f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 19 Sep 2014 13:07:54 -0700 Subject: net: dsa: allow switch drivers to specify phy_device::dev_flags Some switch drivers (e.g: bcm_sf2) may have to communicate specific workarounds or flags towards the PHY device driver. Allow switches driver to be delegated that task by introducing a get_phy_flags() callback which will do just that. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/slave.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index c779e9bba1b3..e020b8a12b7d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -181,6 +181,7 @@ struct dsa_switch_driver { char *(*probe)(struct device *host_dev, int sw_addr); int (*setup)(struct dsa_switch *ds); int (*set_addr)(struct dsa_switch *ds, u8 *addr); + u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* * Access to the switch's PHY registers. diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 90c9689ed362..a7997265019a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -371,6 +371,7 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, struct dsa_chip_data *cd = ds->pd; struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; + u32 phy_flags = 0; int ret; port_dn = cd->port_dn[p->port]; @@ -390,9 +391,12 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, phy_dn = port_dn; } + if (ds->drv->get_phy_flags) + phy_flags = ds->drv->get_phy_flags(ds, p->port); + if (phy_dn) p->phy = of_phy_connect(slave_dev, phy_dn, - dsa_slave_adjust_link, 0, + dsa_slave_adjust_link, phy_flags, p->phy_interface); if (p->phy && phy_is_fixed) @@ -480,6 +484,9 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, netif_carrier_off(slave_dev); if (p->phy != NULL) { + if (ds->drv->get_phy_flags(ds, port)) + p->phy->dev_flags |= ds->drv->get_phy_flags(ds, port); + phy_attach(slave_dev, dev_name(&p->phy->dev), PHY_INTERFACE_MODE_GMII); -- cgit v1.2.3 From ab34f6480806263d7b4d00fa06d3647bac73b68c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Sep 2014 08:05:05 -0700 Subject: net: sched: use __skb_queue_head_init() where applicable pfifo_fast and htb use skb lists, without needing their spinlocks. (They instead use the standard qdisc lock) We can use __skb_queue_head_init() instead of skb_queue_head_init() to be consistent. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 2 +- net/sched/sch_htb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 346ef85617d3..11b28f651ad1 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -523,7 +523,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - skb_queue_head_init(band2list(priv, prio)); + __skb_queue_head_init(band2list(priv, prio)); /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 6d16b9b81c28..14408f262143 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1044,7 +1044,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); INIT_WORK(&q->work, htb_work_func); - skb_queue_head_init(&q->direct_queue); + __skb_queue_head_init(&q->direct_queue); if (tb[TCA_HTB_DIRECT_QLEN]) q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); -- cgit v1.2.3 From 4e2840eee6b21cb5230bd7cac8407badb201aac3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 17 Sep 2014 11:11:46 -0700 Subject: net: sched: cls_u32: rcu can not be last node tc_u32_sel 'sel' in tc_u_knode expects to be the last element in the structure and pads the structure with tc_u32_key fields for each key. kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL) CC: Eric Dumazet Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index eceeb0456d26..730edb29d43b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -64,8 +64,11 @@ struct tc_u_knode { u32 __percpu *pcpu_success; #endif struct tcf_proto *tp; - struct tc_u32_sel sel; struct rcu_head rcu; + /* The 'sel' field MUST be the last field in structure to allow for + * tc_u32_keys allocated at end of structure. + */ + struct tc_u32_sel sel; }; struct tc_u_hnode { -- cgit v1.2.3 From ce3e02867ed8e12c6e6e83a793d273c1f4d929ea Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:25:55 -0700 Subject: net: Export inet_offloads and inet6_offloads Want to be able to use these in foo-over-udp offloads, etc. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/protocol.c | 1 + net/ipv6/protocol.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 46d6a1c923a8..4b7c0ec65251 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -30,6 +30,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(inet_offloads); int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index e048cf1bb6a2..e3770abe688a 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -51,6 +51,7 @@ EXPORT_SYMBOL(inet6_del_protocol); #endif const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(inet6_offloads); int inet6_add_offload(const struct net_offload *prot, unsigned char protocol) { -- cgit v1.2.3 From 23461551c00628c3f3fe9cf837bf53cf8f212b63 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:25:56 -0700 Subject: fou: Support for foo-over-udp RX path This patch provides a receive path for foo-over-udp. This allows direct encapsulation of IP protocols over UDP. The bound destination port is used to map to an IP protocol, and the XFRM framework (udp_encap_rcv) is used to receive encapsulated packets. Upon reception, the encapsulation header is logically removed (pointer to transport header is advanced) and the packet is reinjected into the receive path with the IP protocol indicated by the mapping. Netlink is used to configure FOU ports. The configuration information includes the port number to bind to and the IP protocol corresponding to that port. This should support GRE/UDP (http://tools.ietf.org/html/draft-yong-tsvwg-gre-in-udp-encap-02), as will as the other IP tunneling protocols (IPIP, SIT). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/fou.h | 32 ++++++ net/ipv4/Kconfig | 10 ++ net/ipv4/Makefile | 1 + net/ipv4/fou.c | 279 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 322 insertions(+) create mode 100644 include/uapi/linux/fou.h create mode 100644 net/ipv4/fou.c (limited to 'net') diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h new file mode 100644 index 000000000000..e03376de453d --- /dev/null +++ b/include/uapi/linux/fou.h @@ -0,0 +1,32 @@ +/* fou.h - FOU Interface */ + +#ifndef _UAPI_LINUX_FOU_H +#define _UAPI_LINUX_FOU_H + +/* NETLINK_GENERIC related info + */ +#define FOU_GENL_NAME "fou" +#define FOU_GENL_VERSION 0x1 + +enum { + FOU_ATTR_UNSPEC, + FOU_ATTR_PORT, /* u16 */ + FOU_ATTR_AF, /* u8 */ + FOU_ATTR_IPPROTO, /* u8 */ + + __FOU_ATTR_MAX, +}; + +#define FOU_ATTR_MAX (__FOU_ATTR_MAX - 1) + +enum { + FOU_CMD_UNSPEC, + FOU_CMD_ADD, + FOU_CMD_DEL, + + __FOU_CMD_MAX, +}; + +#define FOU_CMD_MAX (__FOU_CMD_MAX - 1) + +#endif /* _UAPI_LINUX_FOU_H */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index dbc10d84161f..84f710b7472a 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -311,6 +311,16 @@ config NET_UDP_TUNNEL tristate default n +config NET_FOU + tristate "IP: Foo (IP protocols) over UDP" + select XFRM + select NET_UDP_TUNNEL + ---help--- + Foo over UDP allows any IP protocol to be directly encapsulated + over UDP include tunnels (IPIP, GRE, SIT). By encapsulating in UDP + network mechanisms and optimizations for UDP (such as ECMP + and RSS) can be leveraged to provide better service. + config INET_AH tristate "IP: AH transformation" select XFRM_ALGO diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 8ee1cd4053ee..d78d404c596f 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o gre-y := gre_demux.o +obj-$(CONFIG_NET_FOU) += fou.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c new file mode 100644 index 000000000000..d44f97b79418 --- /dev/null +++ b/net/ipv4/fou.c @@ -0,0 +1,279 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(fou_lock); +static LIST_HEAD(fou_list); + +struct fou { + struct socket *sock; + u8 protocol; + u16 port; + struct list_head list; +}; + +struct fou_cfg { + u8 protocol; + struct udp_port_cfg udp_config; +}; + +static inline struct fou *fou_from_sock(struct sock *sk) +{ + return sk->sk_user_data; +} + +static int fou_udp_encap_recv_deliver(struct sk_buff *skb, + u8 protocol, size_t len) +{ + struct iphdr *iph = ip_hdr(skb); + + /* Remove 'len' bytes from the packet (UDP header and + * FOU header if present), modify the protocol to the one + * we found, and then call rcv_encap. + */ + iph->tot_len = htons(ntohs(iph->tot_len) - len); + __skb_pull(skb, len); + skb_postpull_rcsum(skb, udp_hdr(skb), len); + skb_reset_transport_header(skb); + + return -protocol; +} + +static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) +{ + struct fou *fou = fou_from_sock(sk); + + if (!fou) + return 1; + + return fou_udp_encap_recv_deliver(skb, fou->protocol, + sizeof(struct udphdr)); +} + +static int fou_add_to_port_list(struct fou *fou) +{ + struct fou *fout; + + spin_lock(&fou_lock); + list_for_each_entry(fout, &fou_list, list) { + if (fou->port == fout->port) { + spin_unlock(&fou_lock); + return -EALREADY; + } + } + + list_add(&fou->list, &fou_list); + spin_unlock(&fou_lock); + + return 0; +} + +static void fou_release(struct fou *fou) +{ + struct socket *sock = fou->sock; + struct sock *sk = sock->sk; + + udp_del_offload(&fou->udp_offloads); + + list_del(&fou->list); + + /* Remove hooks into tunnel socket */ + sk->sk_user_data = NULL; + + sock_release(sock); + + kfree(fou); +} + +static int fou_create(struct net *net, struct fou_cfg *cfg, + struct socket **sockp) +{ + struct fou *fou = NULL; + int err; + struct socket *sock = NULL; + struct sock *sk; + + /* Open UDP socket */ + err = udp_sock_create(net, &cfg->udp_config, &sock); + if (err < 0) + goto error; + + /* Allocate FOU port structure */ + fou = kzalloc(sizeof(*fou), GFP_KERNEL); + if (!fou) { + err = -ENOMEM; + goto error; + } + + sk = sock->sk; + + /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ + fou->protocol = cfg->protocol; + fou->port = cfg->udp_config.local_udp_port; + udp_sk(sk)->encap_rcv = fou_udp_recv; + + udp_sk(sk)->encap_type = 1; + udp_encap_enable(); + + sk->sk_user_data = fou; + fou->sock = sock; + + udp_set_convert_csum(sk, true); + + sk->sk_allocation = GFP_ATOMIC; + + err = fou_add_to_port_list(fou); + if (err) + goto error; + + if (sockp) + *sockp = sock; + + return 0; + +error: + kfree(fou); + if (sock) + sock_release(sock); + + return err; +} + +static int fou_destroy(struct net *net, struct fou_cfg *cfg) +{ + struct fou *fou; + u16 port = cfg->udp_config.local_udp_port; + int err = -EINVAL; + + spin_lock(&fou_lock); + list_for_each_entry(fou, &fou_list, list) { + if (fou->port == port) { + fou_release(fou); + err = 0; + break; + } + } + spin_unlock(&fou_lock); + + return err; +} + +static struct genl_family fou_nl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = FOU_GENL_NAME, + .version = FOU_GENL_VERSION, + .maxattr = FOU_ATTR_MAX, + .netnsok = true, +}; + +static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { + [FOU_ATTR_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_AF] = { .type = NLA_U8, }, + [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, +}; + +static int parse_nl_config(struct genl_info *info, + struct fou_cfg *cfg) +{ + memset(cfg, 0, sizeof(*cfg)); + + cfg->udp_config.family = AF_INET; + + if (info->attrs[FOU_ATTR_AF]) { + u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); + + if (family != AF_INET && family != AF_INET6) + return -EINVAL; + + cfg->udp_config.family = family; + } + + if (info->attrs[FOU_ATTR_PORT]) { + u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]); + + cfg->udp_config.local_udp_port = port; + } + + if (info->attrs[FOU_ATTR_IPPROTO]) + cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); + + return 0; +} + +static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info) +{ + struct fou_cfg cfg; + int err; + + err = parse_nl_config(info, &cfg); + if (err) + return err; + + return fou_create(&init_net, &cfg, NULL); +} + +static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) +{ + struct fou_cfg cfg; + + parse_nl_config(info, &cfg); + + return fou_destroy(&init_net, &cfg); +} + +static const struct genl_ops fou_nl_ops[] = { + { + .cmd = FOU_CMD_ADD, + .doit = fou_nl_cmd_add_port, + .policy = fou_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = FOU_CMD_DEL, + .doit = fou_nl_cmd_rm_port, + .policy = fou_nl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + +static int __init fou_init(void) +{ + int ret; + + ret = genl_register_family_with_ops(&fou_nl_family, + fou_nl_ops); + + return ret; +} + +static void __exit fou_fini(void) +{ + struct fou *fou, *next; + + genl_unregister_family(&fou_nl_family); + + /* Close all the FOU sockets */ + + spin_lock(&fou_lock); + list_for_each_entry_safe(fou, next, &fou_list, list) + fou_release(fou); + spin_unlock(&fou_lock); +} + +module_init(fou_init); +module_exit(fou_fini); +MODULE_AUTHOR("Tom Herbert "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From afe93325bc02a5b2dea0cd7d78225de692265e6e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:25:57 -0700 Subject: fou: Add GRO support Implement fou_gro_receive and fou_gro_complete, and populate these in the correponsing udp_offloads for the socket. Added ipproto to udp_offloads and pass this from UDP to the fou GRO routine in proto field of napi_gro_cb structure. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +- net/ipv4/fou.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/udp_offload.c | 5 ++- 3 files changed, 95 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28d4378615e5..4354b4307e37 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1874,7 +1874,7 @@ struct napi_gro_cb { /* jiffies when first packet was created/queued */ unsigned long age; - /* Used in ipv6_gro_receive() */ + /* Used in ipv6_gro_receive() and foo-over-udp */ u16 proto; /* Used in udp_gro_receive */ @@ -1925,6 +1925,7 @@ struct packet_offload { struct udp_offload { __be16 port; + u8 ipproto; struct offload_callbacks callbacks; }; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index d44f97b79418..dced89fbe480 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ struct fou { struct socket *sock; u8 protocol; u16 port; + struct udp_offload udp_offloads; struct list_head list; }; @@ -62,6 +64,69 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) sizeof(struct udphdr)); } +static struct sk_buff **fou_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + const struct net_offload **offloads) +{ + const struct net_offload *ops; + struct sk_buff **pp = NULL; + u8 proto = NAPI_GRO_CB(skb)->proto; + + rcu_read_lock(); + ops = rcu_dereference(offloads[proto]); + if (!ops || !ops->callbacks.gro_receive) + goto out_unlock; + + pp = ops->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); + + return pp; +} + +static int fou_gro_complete(struct sk_buff *skb, int nhoff, + const struct net_offload **offloads) +{ + const struct net_offload *ops; + u8 proto = NAPI_GRO_CB(skb)->proto; + int err = -ENOSYS; + + rcu_read_lock(); + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + goto out_unlock; + + err = ops->callbacks.gro_complete(skb, nhoff); + +out_unlock: + rcu_read_unlock(); + + return err; +} + +static struct sk_buff **fou4_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + return fou_gro_receive(head, skb, inet_offloads); +} + +static int fou4_gro_complete(struct sk_buff *skb, int nhoff) +{ + return fou_gro_complete(skb, nhoff, inet_offloads); +} + +static struct sk_buff **fou6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + return fou_gro_receive(head, skb, inet6_offloads); +} + +static int fou6_gro_complete(struct sk_buff *skb, int nhoff) +{ + return fou_gro_complete(skb, nhoff, inet6_offloads); +} + static int fou_add_to_port_list(struct fou *fou) { struct fou *fout; @@ -134,6 +199,29 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; + switch (cfg->udp_config.family) { + case AF_INET: + fou->udp_offloads.callbacks.gro_receive = fou4_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou4_gro_complete; + break; + case AF_INET6: + fou->udp_offloads.callbacks.gro_receive = fou6_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou6_gro_complete; + break; + default: + err = -EPFNOSUPPORT; + goto error; + } + + fou->udp_offloads.port = cfg->udp_config.local_udp_port; + fou->udp_offloads.ipproto = cfg->protocol; + + if (cfg->udp_config.family == AF_INET) { + err = udp_add_offload(&fou->udp_offloads); + if (err) + goto error; + } + err = fou_add_to_port_list(fou); if (err) goto error; @@ -160,6 +248,7 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) spin_lock(&fou_lock); list_for_each_entry(fou, &fou_list, list) { if (fou->port == port) { + udp_del_offload(&fou->udp_offloads); fou_release(fou); err = 0; break; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index adab393b2fe5..d7c43f764c71 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -276,6 +276,7 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; pp = uo_priv->offload->callbacks.gro_receive(head, skb); out_unlock: @@ -329,8 +330,10 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) break; } - if (uo_priv != NULL) + if (uo_priv != NULL) { + NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + } rcu_read_unlock(); return err; -- cgit v1.2.3 From 56328486539ddd07cbaafec7a542a2c8a3043623 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:25:58 -0700 Subject: net: Changes to ip_tunnel to support foo-over-udp encapsulation This patch changes IP tunnel to support (secondary) encapsulation, Foo-over-UDP. Changes include: 1) Adding tun_hlen as the tunnel header length, encap_hlen as the encapsulation header length, and hlen becomes the grand total of these. 2) Added common netlink define to support FOU encapsulation. 3) Routines to perform FOU encapsulation. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 19 ++++++++- include/uapi/linux/if_tunnel.h | 12 ++++++ net/ipv4/ip_tunnel.c | 91 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 120 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 8dd8cab88b87..7f538ba6e267 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_IPV6) @@ -31,6 +32,13 @@ struct ip_tunnel_6rd_parm { }; #endif +struct ip_tunnel_encap { + __u16 type; + __u16 flags; + __be16 sport; + __be16 dport; +}; + struct ip_tunnel_prl_entry { struct ip_tunnel_prl_entry __rcu *next; __be32 addr; @@ -56,13 +64,18 @@ struct ip_tunnel { /* These four fields used only by GRE */ __u32 i_seqno; /* The last seen seqno */ __u32 o_seqno; /* The last output seqno */ - int hlen; /* Precalculated header length */ + int tun_hlen; /* Precalculated header length */ int mlink; struct ip_tunnel_dst __percpu *dst_cache; struct ip_tunnel_parm parms; + int encap_hlen; /* Encap header length (FOU,GUE) */ + struct ip_tunnel_encap encap; + + int hlen; /* tun_hlen + encap_hlen */ + /* for SIT */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm ip6rd; @@ -114,6 +127,8 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, + u8 *protocol, struct flowi4 *fl4); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, @@ -131,6 +146,8 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); void ip_tunnel_setup(struct net_device *dev, int net_id); void ip_tunnel_dst_reset_all(struct ip_tunnel *t); +int ip_tunnel_encap_setup(struct ip_tunnel *t, + struct ip_tunnel_encap *ipencap); /* Extract dsfield from inner protocol */ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 3bce9e9d9f7c..9fedca7d8dae 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -53,10 +53,22 @@ enum { IFLA_IPTUN_6RD_RELAY_PREFIX, IFLA_IPTUN_6RD_PREFIXLEN, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, + IFLA_IPTUN_ENCAP_TYPE, + IFLA_IPTUN_ENCAP_FLAGS, + IFLA_IPTUN_ENCAP_SPORT, + IFLA_IPTUN_ENCAP_DPORT, __IFLA_IPTUN_MAX, }; #define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) +enum tunnel_encap_types { + TUNNEL_ENCAP_NONE, + TUNNEL_ENCAP_FOU, +}; + +#define TUNNEL_ENCAP_FLAG_CSUM (1<<0) +#define TUNNEL_ENCAP_FLAG_CSUM6 (1<<1) + /* SIT-mode i_flags */ #define SIT_ISATAP 0x0001 diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index afed1aac2638..e3a3dc91e49c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -55,6 +55,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -487,6 +488,91 @@ drop: } EXPORT_SYMBOL_GPL(ip_tunnel_rcv); +static int ip_encap_hlen(struct ip_tunnel_encap *e) +{ + switch (e->type) { + case TUNNEL_ENCAP_NONE: + return 0; + case TUNNEL_ENCAP_FOU: + return sizeof(struct udphdr); + default: + return -EINVAL; + } +} + +int ip_tunnel_encap_setup(struct ip_tunnel *t, + struct ip_tunnel_encap *ipencap) +{ + int hlen; + + memset(&t->encap, 0, sizeof(t->encap)); + + hlen = ip_encap_hlen(ipencap); + if (hlen < 0) + return hlen; + + t->encap.type = ipencap->type; + t->encap.sport = ipencap->sport; + t->encap.dport = ipencap->dport; + t->encap.flags = ipencap->flags; + + t->encap_hlen = hlen; + t->hlen = t->encap_hlen + t->tun_hlen; + + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); + +static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + size_t hdr_len, u8 *protocol, struct flowi4 *fl4) +{ + struct udphdr *uh; + __be16 sport; + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + /* Get length and hash before making space in skb */ + + sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + + skb_push(skb, hdr_len); + + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + uh->check = 0; + udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, + fl4->saddr, fl4->daddr, skb->len); + + *protocol = IPPROTO_UDP; + + return 0; +} + +int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, + u8 *protocol, struct flowi4 *fl4) +{ + switch (t->encap.type) { + case TUNNEL_ENCAP_NONE: + return 0; + case TUNNEL_ENCAP_FOU: + return fou_build_header(skb, &t->encap, t->encap_hlen, + protocol, fl4); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(ip_tunnel_encap); + static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df) { @@ -536,7 +622,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, } void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, - const struct iphdr *tnl_params, const u8 protocol) + const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *inner_iph; @@ -617,6 +703,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); + if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) + goto tx_error; + rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; if (!rt) { -- cgit v1.2.3 From 14909664e4e192f4c6f6fcdccd9919af7cf783ab Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:25:59 -0700 Subject: sit: Setup and TX path for sit/UDP foo-over-udp encapsulation Added netlink handling of IP tunnel encapulation paramters, properly adjust MTU for encapsulation. Added ip_tunnel_encap call to ipip6_tunnel_xmit to actually perform FOU encapsulation. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/sit.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 97 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 86e3fa81f85e..db75809ab843 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -822,6 +822,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, int addr_type; u8 ttl; int err; + u8 protocol = IPPROTO_IPV6; + int t_hlen = tunnel->hlen + sizeof(struct iphdr); if (skb->protocol != htons(ETH_P_IPV6)) goto tx_error; @@ -911,8 +913,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto tx_error; } + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); + if (IS_ERR(skb)) { + ip_rt_put(rt); + goto out; + } + if (df) { - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->dst) - t_hlen; if (mtu < 68) { dev->stats.collisions++; @@ -947,7 +955,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, /* * Okay, now see if we can stuff it in the buffer as-is. */ - max_headroom = LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr); + max_headroom = LL_RESERVED_SPACE(tdev) + t_hlen; if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { @@ -969,14 +977,13 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); - if (IS_ERR(skb)) { + if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) { ip_rt_put(rt); - goto out; + goto tx_error; } err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, - IPPROTO_IPV6, tos, ttl, df, + protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; @@ -1059,8 +1066,10 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev) { + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); - dev->mtu = tdev->mtu - sizeof(struct iphdr); + dev->mtu = tdev->mtu - t_hlen; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; } @@ -1307,7 +1316,10 @@ done: static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) { - if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + + if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - t_hlen) return -EINVAL; dev->mtu = new_mtu; return 0; @@ -1338,12 +1350,15 @@ static void ipip6_dev_free(struct net_device *dev) static void ipip6_tunnel_setup(struct net_device *dev) { + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + dev->netdev_ops = &ipip6_netdev_ops; dev->destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); - dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); + dev->hard_header_len = LL_MAX_HEADER + t_hlen; + dev->mtu = ETH_DATA_LEN - t_hlen; dev->flags = IFF_NOARP; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->iflink = 0; @@ -1466,6 +1481,40 @@ static void ipip6_netlink_parms(struct nlattr *data[], } +/* This function returns true when ENCAP attributes are present in the nl msg */ +static bool ipip6_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_IPTUN_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); + } + + if (data[IFLA_IPTUN_ENCAP_FLAGS]) { + ret = true; + ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); + } + + if (data[IFLA_IPTUN_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]); + } + + if (data[IFLA_IPTUN_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]); + } + + return ret; +} + #ifdef CONFIG_IPV6_SIT_6RD /* This function returns true when 6RD attributes are present in the nl msg */ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], @@ -1509,12 +1558,20 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev, { struct net *net = dev_net(dev); struct ip_tunnel *nt; + struct ip_tunnel_encap ipencap; #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif int err; nt = netdev_priv(dev); + + if (ipip6_netlink_encap_parms(data, &ipencap)) { + err = ip_tunnel_encap_setup(nt, &ipencap); + if (err < 0) + return err; + } + ipip6_netlink_parms(data, &nt->parms); if (ipip6_tunnel_locate(net, &nt->parms, 0)) @@ -1537,15 +1594,23 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif + int err; if (dev == sitn->fb_tunnel_dev) return -EINVAL; + if (ipip6_netlink_encap_parms(data, &ipencap)) { + err = ip_tunnel_encap_setup(t, &ipencap); + if (err < 0) + return err; + } + ipip6_netlink_parms(data, &p); if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || @@ -1599,6 +1664,14 @@ static size_t ipip6_get_size(const struct net_device *dev) /* IFLA_IPTUN_6RD_RELAY_PREFIXLEN */ nla_total_size(2) + #endif + /* IFLA_IPTUN_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_DPORT */ + nla_total_size(2) + 0; } @@ -1630,6 +1703,16 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; #endif + if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, + tunnel->encap.type) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT, + tunnel->encap.sport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, + tunnel->encap.dport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, + tunnel->encap.dport)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -1651,6 +1734,10 @@ static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_6RD_PREFIXLEN] = { .type = NLA_U16 }, [IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = { .type = NLA_U16 }, #endif + [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, }; static void ipip6_dellink(struct net_device *dev, struct list_head *head) -- cgit v1.2.3 From 473ab820dd4af588785a8e10b9c1547aadb4fd72 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:26:00 -0700 Subject: ipip: Setup and TX path for ipip/UDP foo-over-udp encapsulation Add netlink handling for IP tunnel encapsulation parameters and and adjustment of MTU for encapsulation. ip_tunnel_encap is called from ip_tunnel_xmit to actually perform FOU encapsulation. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 62eaa005e146..bfec31df8b21 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -301,7 +301,8 @@ static int ipip_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); - tunnel->hlen = 0; + tunnel->tun_hlen = 0; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; tunnel->parms.iph.protocol = IPPROTO_IPIP; return ip_tunnel_init(dev); } @@ -340,10 +341,53 @@ static void ipip_netlink_parms(struct nlattr *data[], parms->iph.frag_off = htons(IP_DF); } +/* This function returns true when ENCAP attributes are present in the nl msg */ +static bool ipip_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_IPTUN_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); + } + + if (data[IFLA_IPTUN_ENCAP_FLAGS]) { + ret = true; + ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); + } + + if (data[IFLA_IPTUN_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]); + } + + if (data[IFLA_IPTUN_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]); + } + + return ret; +} + static int ipip_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipip_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipip_netlink_parms(data, &p); return ip_tunnel_newlink(dev, tb, &p); @@ -353,6 +397,15 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipip_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipip_netlink_parms(data, &p); @@ -378,6 +431,14 @@ static size_t ipip_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + + /* IFLA_IPTUN_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_DPORT */ + nla_total_size(2) + 0; } @@ -394,6 +455,17 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF)))) goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, + tunnel->encap.type) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT, + tunnel->encap.sport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, + tunnel->encap.dport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, + tunnel->encap.dport)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -407,6 +479,10 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, + [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { -- cgit v1.2.3 From 4565e9919cda747815547e2e5d7b78f15efbffdf Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:26:01 -0700 Subject: gre: Setup and TX path for gre/UDP foo-over-udp encapsulation Added netlink attrs to configure FOU encapsulation for GRE, netlink handling of these flags, and properly adjust MTU for encapsulation. ip_tunnel_encap is called from ip_tunnel_xmit to actually perform FOU encapsulation. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/if_tunnel.h | 4 ++ net/ipv4/ip_gre.c | 90 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 9fedca7d8dae..7c832afdfa94 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -106,6 +106,10 @@ enum { IFLA_GRE_ENCAP_LIMIT, IFLA_GRE_FLOWINFO, IFLA_GRE_FLAGS, + IFLA_GRE_ENCAP_TYPE, + IFLA_GRE_ENCAP_FLAGS, + IFLA_GRE_ENCAP_SPORT, + IFLA_GRE_ENCAP_DPORT, __IFLA_GRE_MAX, }; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9b842544aea3..829aff8bf723 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -239,7 +239,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, tpi.seq = htonl(tunnel->o_seqno); /* Push GRE header. */ - gre_build_header(skb, &tpi, tunnel->hlen); + gre_build_header(skb, &tpi, tunnel->tun_hlen); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } @@ -310,7 +310,7 @@ out: static int ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - int err = 0; + int err; struct ip_tunnel_parm p; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) @@ -470,13 +470,18 @@ static void ipgre_tunnel_setup(struct net_device *dev) static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; + int t_hlen; tunnel = netdev_priv(dev); - tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); + tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); tunnel->parms.iph.protocol = IPPROTO_GRE; - dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; - dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + + t_hlen = tunnel->hlen + sizeof(struct iphdr); + + dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; + dev->mtu = ETH_DATA_LEN - t_hlen - 4; dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -628,6 +633,40 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], parms->iph.frag_off = htons(IP_DF); } +/* This function returns true when ENCAP attributes are present in the nl msg */ +static bool ipgre_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_GRE_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); + } + + if (data[IFLA_GRE_ENCAP_FLAGS]) { + ret = true; + ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); + } + + if (data[IFLA_GRE_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_u16(data[IFLA_GRE_ENCAP_SPORT]); + } + + if (data[IFLA_GRE_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_u16(data[IFLA_GRE_ENCAP_DPORT]); + } + + return ret; +} + static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); @@ -657,6 +696,15 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipgre_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipgre_netlink_parms(data, tb, &p); return ip_tunnel_newlink(dev, tb, &p); @@ -666,6 +714,15 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipgre_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipgre_netlink_parms(data, tb, &p); return ip_tunnel_changelink(dev, tb, &p); @@ -694,6 +751,14 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_GRE_PMTUDISC */ nla_total_size(1) + + /* IFLA_GRE_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_DPORT */ + nla_total_size(2) + 0; } @@ -714,6 +779,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)))) goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, + t->encap.type) || + nla_put_u16(skb, IFLA_GRE_ENCAP_SPORT, + t->encap.sport) || + nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT, + t->encap.dport) || + nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, + t->encap.dport)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -731,6 +807,10 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_TOS] = { .type = NLA_U8 }, [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, + [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { -- cgit v1.2.3 From 6d967f8789249628a6388a3a4314c5fef423f36a Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 19 Sep 2014 18:02:53 -0700 Subject: udp_tunnel: Only build ip6_udp_tunnel.c when IPV6 is selected Functions supplied in ip6_udp_tunnel.c are only needed when IPV6 is selected. When IPV6 is not selected, those functions are stubbed out in udp_tunnel.h. ================================================================== net/ipv6/ip6_udp_tunnel.c:15:5: error: redefinition of 'udp_sock_create6' int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, In file included from net/ipv6/ip6_udp_tunnel.c:9:0: include/net/udp_tunnel.h:36:19: note: previous definition of 'udp_sock_create6' was here static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, ================================================================== Fixes: fd384412e udp_tunnel: Seperate ipv6 functions into its own file Reported-by: kbuild test robot Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- net/ipv6/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 45f830e5f820..2e8c06108ab9 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -35,7 +35,6 @@ obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_IPV6_MIP6) += mip6.o obj-$(CONFIG_NETFILTER) += netfilter/ -obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o obj-$(CONFIG_IPV6_VTI) += ip6_vti.o obj-$(CONFIG_IPV6_SIT) += sit.o @@ -46,3 +45,7 @@ obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o + +ifneq ($(CONFIG_IPV6),) +obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o +endif -- cgit v1.2.3 From 2446254915a7d6f08bba9a755a34cc0402880472 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 18 Sep 2014 17:31:22 -0700 Subject: net: dsa: allow switch drivers to implement suspend/resume hooks Add an abstraction layer to suspend/resume switch devices, doing the following split: - suspend/resume the slave network devices and their corresponding PHY devices - suspend/resume the switch hardware using switch driver callbacks Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++ net/dsa/dsa.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/dsa_priv.h | 2 ++ net/dsa/slave.c | 31 +++++++++++++++++++++ 4 files changed, 119 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index e020b8a12b7d..846dce4abaa5 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -210,6 +210,12 @@ struct dsa_switch_driver { void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds); + + /* + * Suspend and resume + */ + int (*suspend)(struct dsa_switch *ds); + int (*resume)(struct dsa_switch *ds); }; void register_switch_driver(struct dsa_switch_driver *type); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6e40928ec0e7..6905f2d84c44 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -238,6 +238,49 @@ static void dsa_switch_destroy(struct dsa_switch *ds) { } +static int dsa_switch_suspend(struct dsa_switch *ds) +{ + int i, ret = 0; + + /* Suspend slave network devices */ + for (i = 0; i < DSA_MAX_PORTS; i++) { + if (!(ds->phys_port_mask & (1 << i))) + continue; + + ret = dsa_slave_suspend(ds->ports[i]); + if (ret) + return ret; + } + + if (ds->drv->suspend) + ret = ds->drv->suspend(ds); + + return ret; +} + +static int dsa_switch_resume(struct dsa_switch *ds) +{ + int i, ret = 0; + + if (ds->drv->resume) + ret = ds->drv->resume(ds); + + if (ret) + return ret; + + /* Resume slave network devices */ + for (i = 0; i < DSA_MAX_PORTS; i++) { + if (!(ds->phys_port_mask & (1 << i))) + continue; + + ret = dsa_slave_resume(ds->ports[i]); + if (ret) + return ret; + } + + return 0; +} + /* link polling *************************************************************/ static void dsa_link_poll_work(struct work_struct *ugly) @@ -650,6 +693,42 @@ static struct packet_type dsa_pack_type __read_mostly = { .func = dsa_switch_rcv, }; +#ifdef CONFIG_PM_SLEEP +static int dsa_suspend(struct device *d) +{ + struct platform_device *pdev = to_platform_device(d); + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + int i, ret = 0; + + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds != NULL) + ret = dsa_switch_suspend(ds); + } + + return ret; +} + +static int dsa_resume(struct device *d) +{ + struct platform_device *pdev = to_platform_device(d); + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + int i, ret = 0; + + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds != NULL) + ret = dsa_switch_resume(ds); + } + + return ret; +} +#endif + +static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume); + static const struct of_device_id dsa_of_match_table[] = { { .compatible = "brcm,bcm7445-switch-v4.0" }, { .compatible = "marvell,dsa", }, @@ -665,6 +744,7 @@ static struct platform_driver dsa_driver = { .name = "dsa", .owner = THIS_MODULE, .of_match_table = dsa_of_match_table, + .pm = &dsa_pm_ops, }, }; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index f90899e8ab5a..dc9756d3154c 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -56,6 +56,8 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds); struct net_device *dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name); +int dsa_slave_suspend(struct net_device *slave_dev); +int dsa_slave_resume(struct net_device *slave_dev); /* tag_dsa.c */ extern const struct dsa_device_ops dsa_netdev_ops; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a7997265019a..143811ef57ae 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -412,6 +412,37 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, p->phy->addr, p->phy->drv->name); } +int dsa_slave_suspend(struct net_device *slave_dev) +{ + struct dsa_slave_priv *p = netdev_priv(slave_dev); + + netif_device_detach(slave_dev); + + if (p->phy) { + phy_stop(p->phy); + p->old_pause = -1; + p->old_link = -1; + p->old_duplex = -1; + phy_suspend(p->phy); + } + + return 0; +} + +int dsa_slave_resume(struct net_device *slave_dev) +{ + struct dsa_slave_priv *p = netdev_priv(slave_dev); + + netif_device_attach(slave_dev); + + if (p->phy) { + phy_resume(p->phy); + phy_start(p->phy); + } + + return 0; +} + struct net_device * dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name) -- cgit v1.2.3 From 19e57c4e6dc6b82a3204b801f4c5f27c7d007559 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 18 Sep 2014 17:31:24 -0700 Subject: net: dsa: add {get, set}_wol callbacks to slave devices Allow switch drivers to implement per-port Wake-on-LAN getter and setters. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 8 ++++++++ net/dsa/slave.c | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 846dce4abaa5..d8054fb4a4df 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -211,6 +211,14 @@ struct dsa_switch_driver { int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds); + /* + * ethtool Wake-on-LAN + */ + void (*get_wol)(struct dsa_switch *ds, int port, + struct ethtool_wolinfo *w); + int (*set_wol)(struct dsa_switch *ds, int port, + struct ethtool_wolinfo *w); + /* * Suspend and resume */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 143811ef57ae..43c1e4ade689 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -301,6 +301,27 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) return -EOPNOTSUPP; } +static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->drv->get_wol) + ds->drv->get_wol(ds, p->port, w); +} + +static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret = -EOPNOTSUPP; + + if (ds->drv->set_wol) + ret = ds->drv->set_wol(ds, p->port, w); + + return ret; +} + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_settings = dsa_slave_get_settings, .set_settings = dsa_slave_set_settings, @@ -310,6 +331,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_strings = dsa_slave_get_strings, .get_ethtool_stats = dsa_slave_get_ethtool_stats, .get_sset_count = dsa_slave_get_sset_count, + .set_wol = dsa_slave_set_wol, + .get_wol = dsa_slave_get_wol, }; static const struct net_device_ops dsa_slave_netdev_ops = { -- cgit v1.2.3 From cecda693a969816bac5e470e1d9c9c0ef5567bca Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 19 Sep 2014 16:04:38 +0800 Subject: net: keep original skb which only needs header checking during software GSO Commit ce93718fb7cdbc064c3000ff59e4d3200bdfa744 ("net: Don't keep around original SKB when we software segment GSO frames") frees the original skb after software GSO even for dodgy gso skbs. This breaks the stream throughput from untrusted sources, since only header checking was done during software GSO instead of a true segmentation. This patch fixes this by freeing the original gso skb only when it was really segmented by software. Fixes ce93718fb7cdbc064c3000ff59e4d3200bdfa744 ("net: Don't keep around original SKB when we software segment GSO frames.") Cc: David S. Miller Cc: Eric Dumazet Signed-off-by: Jason Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index e916ba8caccf..52cd71a4a343 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2694,10 +2694,12 @@ struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) struct sk_buff *segs; segs = skb_gso_segment(skb, features); - kfree_skb(skb); - if (IS_ERR(segs)) + if (IS_ERR(segs)) { segs = NULL; - skb = segs; + } else if (segs) { + consume_skb(skb); + skb = segs; + } } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) -- cgit v1.2.3 From 3fcb95a84fdb11e922cfac3b5cd3a9d96a9e9995 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 22 Sep 2014 11:39:44 -0700 Subject: udp: Need to make ip6_udp_tunnel.c have GPL license Unable to load various tunneling modules without this: [ 80.679049] fou: Unknown symbol udp_sock_create6 (err 0) [ 91.439939] ip6_udp_tunnel: Unknown symbol ip6_local_out (err 0) [ 91.439954] ip6_udp_tunnel: Unknown symbol __put_net (err 0) [ 91.457792] vxlan: Unknown symbol udp_sock_create6 (err 0) [ 91.457831] vxlan: Unknown symbol udp_tunnel6_xmit_skb (err 0) Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/ip6_udp_tunnel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index cbc990712cc1..b04ed72c4542 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -103,3 +103,5 @@ int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst, return 0; } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); + +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From a1ddcfee2d9ae172d0095f3f8227f7fa53288c65 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Sep 2014 21:50:04 -0700 Subject: net: cls_u32: fix missed pcpu_success free_percpu This fixes a missed free_percpu in the unwind code path and when keys are destroyed. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 730edb29d43b..8d90e50a8ce4 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -362,6 +362,9 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF free_percpu(n->pf); +#endif +#ifdef CONFIG_CLS_U32_MARK + free_percpu(n->pcpu_success); #endif kfree(n); return 0; @@ -693,6 +696,10 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, #ifdef CONFIG_CLS_U32_MARK n->pcpu_success = alloc_percpu(u32); + if (!n->pcpu_success) { + err = -ENOMEM; + goto errout; + } if (tb[TCA_U32_MARK]) { struct tc_u32_mark *mark; @@ -720,6 +727,12 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long)n; return 0; } + +#ifdef CONFIG_CLS_U32_MARK + free_percpu(n->pcpu_success); +#endif + +errout: #ifdef CONFIG_CLS_U32_PERF free_percpu(n->pf); #endif -- cgit v1.2.3 From de5df63228fcfbd5bb7fd883774c18fec9e61f12 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Sep 2014 21:50:34 -0700 Subject: net: sched: cls_u32 changes to knode must appear atomic to readers Changes to the cls_u32 classifier must appear atomic to the readers. Before this patch if a change is requested for both the exts and ifindex, first the ifindex is updated then the exts with tcf_exts_change(). This opens a small window where a reader can have a exts chain with an incorrect ifindex. This violates the the RCU semantics. Here we resolve this by always passing u32_set_parms() a copy of the tc_u_knode to work on and then inserting it into the hash table after the updates have been successfully applied. Tested with the following short script: #tc filter add dev p3p2 parent 8001:0 protocol ip prio 99 handle 1: \ u32 divisor 256 #tc filter add dev p3p2 parent 8001:0 protocol ip prio 99 \ u32 link 1: hashkey mask ffffff00 at 12 \ match ip src 192.168.8.0/2 #tc filter add dev p3p2 parent 8001:0 protocol ip prio 102 \ handle 1::10 u32 classid 1:2 ht 1: \ match ip src 192.168.8.0/8 match ip tos 0x0a 1e #tc filter change dev p3p2 parent 8001:0 protocol ip prio 102 \ handle 1::10 u32 classid 1:2 ht 1: \ match ip src 1.1.0.0/8 match ip tos 0x0b 1e CC: Eric Dumazet CC: Jamal Hadi Salim Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 8d90e50a8ce4..e3fb5308d44a 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -354,27 +354,53 @@ static int u32_init(struct tcf_proto *tp) return 0; } -static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) +static int u32_destroy_key(struct tcf_proto *tp, + struct tc_u_knode *n, + bool free_pf) { tcf_unbind_filter(tp, &n->res); tcf_exts_destroy(tp, &n->exts); if (n->ht_down) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF - free_percpu(n->pf); + if (free_pf) + free_percpu(n->pf); #endif #ifdef CONFIG_CLS_U32_MARK - free_percpu(n->pcpu_success); + if (free_pf) + free_percpu(n->pcpu_success); #endif kfree(n); return 0; } +/* u32_delete_key_rcu should be called when free'ing a copied + * version of a tc_u_knode obtained from u32_init_knode(). When + * copies are obtained from u32_init_knode() the statistics are + * shared between the old and new copies to allow readers to + * continue to update the statistics during the copy. To support + * this the u32_delete_key_rcu variant does not free the percpu + * statistics. + */ static void u32_delete_key_rcu(struct rcu_head *rcu) { struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); - u32_destroy_key(key->tp, key); + u32_destroy_key(key->tp, key, false); +} + +/* u32_delete_key_freepf_rcu is the rcu callback variant + * that free's the entire structure including the statistics + * percpu variables. Only use this if the key is not a copy + * returned by u32_init_knode(). See u32_delete_key_rcu() + * for the variant that should be used with keys return from + * u32_init_knode() + */ +static void u32_delete_key_freepf_rcu(struct rcu_head *rcu) +{ + struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); + + u32_destroy_key(key->tp, key, true); } static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) @@ -390,7 +416,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) if (pkp == key) { RCU_INIT_POINTER(*kp, key->next); - call_rcu(&key->rcu, u32_delete_key_rcu); + call_rcu(&key->rcu, u32_delete_key_freepf_rcu); return 0; } } @@ -408,7 +434,7 @@ static void u32_clear_hnode(struct tc_u_hnode *ht) while ((n = rtnl_dereference(ht->ht[h])) != NULL) { RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); - call_rcu(&n->rcu, u32_delete_key_rcu); + call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } } @@ -584,6 +610,82 @@ errout: return err; } +static void u32_replace_knode(struct tcf_proto *tp, + struct tc_u_common *tp_c, + struct tc_u_knode *n) +{ + struct tc_u_knode __rcu **ins; + struct tc_u_knode *pins; + struct tc_u_hnode *ht; + + if (TC_U32_HTID(n->handle) == TC_U32_ROOT) + ht = rtnl_dereference(tp->root); + else + ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle)); + + ins = &ht->ht[TC_U32_HASH(n->handle)]; + + /* The node must always exist for it to be replaced if this is not the + * case then something went very wrong elsewhere. + */ + for (pins = rtnl_dereference(*ins); ; + ins = &pins->next, pins = rtnl_dereference(*ins)) + if (pins->handle == n->handle) + break; + + RCU_INIT_POINTER(n->next, pins->next); + rcu_assign_pointer(*ins, n); +} + +static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, + struct tc_u_knode *n) +{ + struct tc_u_knode *new; + struct tc_u32_sel *s = &n->sel; + + new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), + GFP_KERNEL); + + if (!new) + return NULL; + + RCU_INIT_POINTER(new->next, n->next); + new->handle = n->handle; + RCU_INIT_POINTER(new->ht_up, n->ht_up); + +#ifdef CONFIG_NET_CLS_IND + new->ifindex = n->ifindex; +#endif + new->fshift = n->fshift; + new->res = n->res; + RCU_INIT_POINTER(new->ht_down, n->ht_down); + + /* bump reference count as long as we hold pointer to structure */ + if (new->ht_down) + new->ht_down->refcnt++; + +#ifdef CONFIG_CLS_U32_PERF + /* Statistics may be incremented by readers during update + * so we must keep them in tact. When the node is later destroyed + * a special destroy call must be made to not free the pf memory. + */ + new->pf = n->pf; +#endif + +#ifdef CONFIG_CLS_U32_MARK + new->val = n->val; + new->mask = n->mask; + /* Similarly success statistics must be moved as pointers */ + new->pcpu_success = n->pcpu_success; +#endif + new->tp = tp; + memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); + + tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE); + + return new; +} + static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -610,12 +712,27 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, n = (struct tc_u_knode *)*arg; if (n) { + struct tc_u_knode *new; + if (TC_U32_KEY(n->handle) == 0) return -EINVAL; - return u32_set_parms(net, tp, base, - rtnl_dereference(n->ht_up), n, tb, - tca[TCA_RATE], ovr); + new = u32_init_knode(tp, n); + if (!new) + return -ENOMEM; + + err = u32_set_parms(net, tp, base, + rtnl_dereference(n->ht_up), new, tb, + tca[TCA_RATE], ovr); + + if (err) { + u32_destroy_key(tp, new, false); + return err; + } + + u32_replace_knode(tp, tp_c, new); + call_rcu(&n->rcu, u32_delete_key_rcu); + return 0; } if (tb[TCA_U32_DIVISOR]) { -- cgit v1.2.3 From 35f7aa5309c048bb70e58571942795fa9411ce6a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Sep 2014 14:03:55 +0200 Subject: ipv6: mld: answer mldv2 queries with mldv1 reports in mldv1 fallback RFC2710 (MLDv1), section 3.7. says: The length of a received MLD message is computed by taking the IPv6 Payload Length value and subtracting the length of any IPv6 extension headers present between the IPv6 header and the MLD message. If that length is greater than 24 octets, that indicates that there are other fields present *beyond* the fields described above, perhaps belonging to a *future backwards-compatible* version of MLD. An implementation of the version of MLD specified in this document *MUST NOT* send an MLD message longer than 24 octets and MUST ignore anything past the first 24 octets of a received MLD message. RFC3810 (MLDv2), section 8.2.1. states for *listeners* regarding presence of MLDv1 routers: In order to be compatible with MLDv1 routers, MLDv2 hosts MUST operate in version 1 compatibility mode. [...] When Host Compatibility Mode is MLDv2, a host acts using the MLDv2 protocol on that interface. When Host Compatibility Mode is MLDv1, a host acts in MLDv1 compatibility mode, using *only* the MLDv1 protocol, on that interface. [...] While section 8.3.1. specifies *router* behaviour regarding presence of MLDv1 routers: MLDv2 routers may be placed on a network where there is at least one MLDv1 router. The following requirements apply: If an MLDv1 router is present on the link, the Querier MUST use the *lowest* version of MLD present on the network. This must be administratively assured. Routers that desire to be compatible with MLDv1 MUST have a configuration option to act in MLDv1 mode; if an MLDv1 router is present on the link, the system administrator must explicitly configure all MLDv2 routers to act in MLDv1 mode. When in MLDv1 mode, the Querier MUST send periodic General Queries truncated at the Multicast Address field (i.e., 24 bytes long), and SHOULD also warn about receiving an MLDv2 Query (such warnings must be rate-limited). The Querier MUST also fill in the Maximum Response Delay in the Maximum Response Code field, i.e., the exponential algorithm described in section 5.1.3. is not used. [...] That means that we should not get queries from different versions of MLD. When there's a MLDv1 router present, MLDv2 enforces truncation and MRC == MRD (both fields are overlapping within the 24 octet range). Section 8.3.2. specifies behaviour in the presence of MLDv1 multicast address *listeners*: MLDv2 routers may be placed on a network where there are hosts that have not yet been upgraded to MLDv2. In order to be compatible with MLDv1 hosts, MLDv2 routers MUST operate in version 1 compatibility mode. MLDv2 routers keep a compatibility mode per multicast address record. The compatibility mode of a multicast address is determined from the Multicast Address Compatibility Mode variable, which can be in one of the two following states: MLDv1 or MLDv2. The Multicast Address Compatibility Mode of a multicast address record is set to MLDv1 whenever an MLDv1 Multicast Listener Report is *received* for that multicast address. At the same time, the Older Version Host Present timer for the multicast address is set to Older Version Host Present Timeout seconds. The timer is re-set whenever a new MLDv1 Report is received for that multicast address. If the Older Version Host Present timer expires, the router switches back to Multicast Address Compatibility Mode of MLDv2 for that multicast address. [...] That means, what can happen is the following scenario, that hosts can act in MLDv1 compatibility mode when they previously have received an MLDv1 query (or, simply operate in MLDv1 mode-only); and at the same time, an MLDv2 router could start up and transmits MLDv2 startup query messages while being unaware of the current operational mode. Given RFC2710, section 3.7 we would need to answer to that with an MLDv1 listener report, so that the router according to RFC3810, section 8.3.2. would receive that and internally switch to MLDv1 compatibility as well. Right now, I believe since the initial implementation of MLDv2, Linux hosts would just silently drop such MLDv2 queries instead of replying with an MLDv1 listener report, which would prevent a MLDv2 router going into fallback mode (until it receives other MLDv1 queries). Since the mapping of MRC to MRD in exactly such cases can make use of the exponential algorithm from 5.1.3, we cannot [strictly speaking] be aware in MLDv1 of the encoding in MRC, it seems also not mentioned by the RFC. Since encodings are the same up to 32767, assume in such a situation this value as a hard upper limit we would clamp. We have asked one of the RFC authors on that regard, and he mentioned that there seem not to be any implementations that make use of that exponential algorithm on startup messages. In any case, this patch fixes this MLD interoperability issue. Signed-off-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/mld.h | 5 ++++- net/ipv6/mcast.c | 41 +++++++++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/mld.h b/include/net/mld.h index faa1d161bf24..01d751303498 100644 --- a/include/net/mld.h +++ b/include/net/mld.h @@ -88,12 +88,15 @@ struct mld2_query { #define MLDV2_QQIC_EXP(value) (((value) >> 4) & 0x07) #define MLDV2_QQIC_MAN(value) ((value) & 0x0f) +#define MLD_EXP_MIN_LIMIT 32768UL +#define MLDV1_MRD_MAX_COMPAT (MLD_EXP_MIN_LIMIT - 1) + static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2) { /* RFC3810, 5.1.3. Maximum Response Code */ unsigned long ret, mc_mrc = ntohs(mlh2->mld2q_mrc); - if (mc_mrc < 32768) { + if (mc_mrc < MLD_EXP_MIN_LIMIT) { ret = mc_mrc; } else { unsigned long mc_man, mc_exp; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 592eba61e78a..9648de2b6745 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1237,7 +1237,7 @@ static void mld_update_qri(struct inet6_dev *idev, } static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, - unsigned long *max_delay) + unsigned long *max_delay, bool v1_query) { unsigned long mldv1_md; @@ -1245,11 +1245,32 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, if (mld_in_v2_mode_only(idev)) return -EINVAL; - /* MLDv1 router present */ mldv1_md = ntohs(mld->mld_maxdelay); + + /* When in MLDv1 fallback and a MLDv2 router start-up being + * unaware of current MLDv1 operation, the MRC == MRD mapping + * only works when the exponential algorithm is not being + * used (as MLDv1 is unaware of such things). + * + * According to the RFC author, the MLDv2 implementations + * he's aware of all use a MRC < 32768 on start up queries. + * + * Thus, should we *ever* encounter something else larger + * than that, just assume the maximum possible within our + * reach. + */ + if (!v1_query) + mldv1_md = min(mldv1_md, MLDV1_MRD_MAX_COMPAT); + *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL); - mld_set_v1_mode(idev); + /* MLDv1 router present: we need to go into v1 mode *only* + * when an MLDv1 query is received as per section 9.12. of + * RFC3810! And we know from RFC2710 section 3.7 that MLDv1 + * queries MUST be of exactly 24 octets. + */ + if (v1_query) + mld_set_v1_mode(idev); /* cancel MLDv2 report timer */ mld_gq_stop_timer(idev); @@ -1264,10 +1285,6 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, unsigned long *max_delay) { - /* hosts need to stay in MLDv1 mode, discard MLDv2 queries */ - if (mld_in_v1_mode(idev)) - return -EINVAL; - *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); mld_update_qrv(idev, mld); @@ -1324,8 +1341,11 @@ int igmp6_event_query(struct sk_buff *skb) !(group_type&IPV6_ADDR_MULTICAST)) return -EINVAL; - if (len == MLD_V1_QUERY_LEN) { - err = mld_process_v1(idev, mld, &max_delay); + if (len < MLD_V1_QUERY_LEN) { + return -EINVAL; + } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) { + err = mld_process_v1(idev, mld, &max_delay, + len == MLD_V1_QUERY_LEN); if (err < 0) return err; } else if (len >= MLD_V2_QUERY_LEN_MIN) { @@ -1357,8 +1377,9 @@ int igmp6_event_query(struct sk_buff *skb) mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } - } else + } else { return -EINVAL; + } read_lock_bh(&idev->lock); if (group_type == IPV6_ADDR_ANY) { -- cgit v1.2.3 From fcdd1cf4dd63aecf86c987d7f4ec7187be5c2fbc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Sep 2014 13:19:44 -0700 Subject: tcp: avoid possible arithmetic overflows icsk_rto is a 32bit field, and icsk_backoff can reach 15 by default, or more if some sysctl (eg tcp_retries2) are changed. Better use 64bit to perform icsk_rto << icsk_backoff operations As Joe Perches suggested, add a helper for this. Yuchung spotted the tcp_v4_err() case. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 9 +++++++++ net/ipv4/tcp_input.c | 5 +++-- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv4/tcp_output.c | 13 ++++++------- net/ipv4/tcp_timer.c | 4 ++-- 5 files changed, 23 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 5fbe6568c3cf..848e85cb5c61 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -242,6 +242,15 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, #endif } +static inline unsigned long +inet_csk_rto_backoff(const struct inet_connection_sock *icsk, + unsigned long max_when) +{ + u64 when = (u64)icsk->icsk_rto << icsk->icsk_backoff; + + return (unsigned long)min_t(u64, when, max_when); +} + struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); struct request_sock *inet_csk_search_req(const struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 02fb66d4a018..13f3da4762e3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3208,9 +3208,10 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { + unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + when, TCP_RTO_MAX); } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 006b045716d8..3b2e49cb2b61 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -430,9 +430,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) break; icsk->icsk_backoff--; - inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : - TCP_TIMEOUT_INIT) << icsk->icsk_backoff; - tcp_bound_rto(sk); + icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : + TCP_TIMEOUT_INIT; + icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); skb = tcp_write_queue_head(sk); BUG_ON(!skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7f1280dcad57..8c61a7c0c889 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3279,6 +3279,7 @@ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + unsigned long probe_max; int err; err = tcp_write_wakeup(sk); @@ -3294,9 +3295,7 @@ void tcp_send_probe0(struct sock *sk) if (icsk->icsk_backoff < sysctl_tcp_retries2) icsk->icsk_backoff++; icsk->icsk_probes_out++; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + probe_max = TCP_RTO_MAX; } else { /* If packet was not sent due to local congestion, * do not backoff and do not remember icsk_probes_out. @@ -3306,11 +3305,11 @@ void tcp_send_probe0(struct sock *sk) */ if (!icsk->icsk_probes_out) icsk->icsk_probes_out = 1; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, - TCP_RESOURCE_PROBE_INTERVAL), - TCP_RTO_MAX); + probe_max = TCP_RESOURCE_PROBE_INTERVAL; } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + inet_csk_rto_backoff(icsk, probe_max), + TCP_RTO_MAX); } int tcp_rtx_synack(struct sock *sk, struct request_sock *req) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a339e7ba05a4..b24360f6e293 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -180,7 +180,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = (icsk->icsk_rto < TCP_RTO_MAX); + const int alive = icsk->icsk_rto < TCP_RTO_MAX; retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -294,7 +294,7 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); + const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); -- cgit v1.2.3 From a2aeb02a8e6a9fef397c344245a54eeae67341f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Sep 2014 13:42:53 -0700 Subject: net: sched: fix compile warning in cls_u32 $ grep CONFIG_CLS_U32_MARK .config # CONFIG_CLS_U32_MARK is not set net/sched/cls_u32.c: In function 'u32_change': net/sched/cls_u32.c:852:1: warning: label 'errout' defined but not used [-Wunused-label] Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index e3fb5308d44a..ef97a646ee94 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -847,9 +847,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, #ifdef CONFIG_CLS_U32_MARK free_percpu(n->pcpu_success); +errout: #endif -errout: #ifdef CONFIG_CLS_U32_PERF free_percpu(n->pf); #endif -- cgit v1.2.3 From 48e68ff5e55af6907d3f90233e5c4d5601a628a6 Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Tue, 23 Sep 2014 11:01:07 +0200 Subject: Bluetooth: Check for SCO type before setting retransmission effort SCO connection cannot be setup to devices that do not support retransmission. Patch based on http://permalink.gmane.org/gmane.linux.bluez.kernel/7779 and adapted for this kernel version. Code changed to check SCO/eSCO type before setting retransmission effort and max. latency. The purpose of the patch is to support older devices not capable of eSCO. Tested on Blackberry 655+ headset which does not support retransmission. Credits go to Alexander Sommerhuber. Signed-off-by: Bernhard Thaler Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index e3d7ae9e2edd..22b253750f78 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -38,7 +38,7 @@ struct sco_param { u16 max_latency; }; -static const struct sco_param sco_param_cvsd[] = { +static const struct sco_param esco_param_cvsd[] = { { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a }, /* S3 */ { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007 }, /* S2 */ { EDR_ESCO_MASK | ESCO_EV3, 0x0007 }, /* S1 */ @@ -46,6 +46,11 @@ static const struct sco_param sco_param_cvsd[] = { { EDR_ESCO_MASK | ESCO_HV1, 0xffff }, /* D0 */ }; +static const struct sco_param sco_param_cvsd[] = { + { EDR_ESCO_MASK | ESCO_HV3, 0xffff }, /* D1 */ + { EDR_ESCO_MASK | ESCO_HV1, 0xffff }, /* D0 */ +}; + static const struct sco_param sco_param_wideband[] = { { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d }, /* T2 */ { EDR_ESCO_MASK | ESCO_EV3, 0x0008 }, /* T1 */ @@ -207,10 +212,17 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) param = &sco_param_wideband[conn->attempt - 1]; break; case SCO_AIRMODE_CVSD: - if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) - return false; - cp.retrans_effort = 0x01; - param = &sco_param_cvsd[conn->attempt - 1]; + if (lmp_esco_capable(conn->link)) { + if (conn->attempt > ARRAY_SIZE(esco_param_cvsd)) + return false; + cp.retrans_effort = 0x01; + param = &esco_param_cvsd[conn->attempt - 1]; + } else { + if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) + return false; + cp.retrans_effort = 0xff; + param = &sco_param_cvsd[conn->attempt - 1]; + } break; default: return false; -- cgit v1.2.3 From 4cdf507d54525842dfd9f6313fdafba039084046 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2014 07:38:40 -0700 Subject: icmp: add a global rate limitation Current ICMP rate limiting uses inetpeer cache, which is an RBL tree protected by a lock, meaning that hosts can be stuck hard if all cpus want to check ICMP limits. When say a DNS or NTP server process is restarted, inetpeer tree grows quick and machine comes to its knees. iptables can not help because the bottleneck happens before ICMP messages are even cooked and sent. This patch adds a new global limitation, using a token bucket filter, controlled by two new sysctl : icmp_msgs_per_sec - INTEGER Limit maximal number of ICMP packets sent per second from this host. Only messages whose type matches icmp_ratemask are controlled by this limit. Default: 1000 icmp_msgs_burst - INTEGER icmp_msgs_per_sec controls number of ICMP packets sent per second, while icmp_msgs_burst controls the burst size of these packets. Default: 50 Note that if we really want to send millions of ICMP messages per second, we might extend idea and infra added in commit 04ca6973f7c1a ("ip: make IP identifiers less predictable") : add a token bucket in the ip_idents hash and no longer rely on inetpeer. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 13 +++++++ include/net/ip.h | 4 +++ net/ipv4/icmp.c | 64 +++++++++++++++++++++++++++++++--- net/ipv4/sysctl_net_ipv4.c | 16 +++++++++ net/ipv6/icmp.c | 20 ++++++----- 5 files changed, 105 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 1b5581a30d77..c7a81ace35d0 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -769,8 +769,21 @@ icmp_ratelimit - INTEGER icmp_ratemask (see below) to specific targets. 0 to disable any limiting, otherwise the minimal space between responses in milliseconds. + Note that another sysctl, icmp_msgs_per_sec limits the number + of ICMP packets sent on all targets. Default: 1000 +icmp_msgs_per_sec - INTEGER + Limit maximal number of ICMP packets sent per second from this host. + Only messages whose type matches icmp_ratemask (see below) are + controlled by this limit. + Default: 1000 + +icmp_msgs_burst - INTEGER + icmp_msgs_per_sec controls number of ICMP packets sent per second, + while icmp_msgs_burst controls the burst size of these packets. + Default: 50 + icmp_ratemask - INTEGER Mask made of ICMP types for which rates are being limited. Significant bits: IHGFEDCBA9876543210 diff --git a/include/net/ip.h b/include/net/ip.h index 14bfc8e1bcf9..fcd9068fb8c3 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -548,6 +548,10 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, u32 info); +bool icmp_global_allow(void); +extern int sysctl_icmp_msgs_per_sec; +extern int sysctl_icmp_msgs_burst; + #ifdef CONFIG_PROC_FS int ip_misc_proc_init(void); #endif diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index ea7d4afe8205..5882f584910e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -231,12 +231,62 @@ static inline void icmp_xmit_unlock(struct sock *sk) spin_unlock_bh(&sk->sk_lock.slock); } +int sysctl_icmp_msgs_per_sec __read_mostly = 1000; +int sysctl_icmp_msgs_burst __read_mostly = 50; + +static struct { + spinlock_t lock; + u32 credit; + u32 stamp; +} icmp_global = { + .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock), +}; + +/** + * icmp_global_allow - Are we allowed to send one more ICMP message ? + * + * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec. + * Returns false if we reached the limit and can not send another packet. + * Note: called with BH disabled + */ +bool icmp_global_allow(void) +{ + u32 credit, delta, incr = 0, now = (u32)jiffies; + bool rc = false; + + /* Check if token bucket is empty and cannot be refilled + * without taking the spinlock. + */ + if (!icmp_global.credit) { + delta = min_t(u32, now - icmp_global.stamp, HZ); + if (delta < HZ / 50) + return false; + } + + spin_lock(&icmp_global.lock); + delta = min_t(u32, now - icmp_global.stamp, HZ); + if (delta >= HZ / 50) { + incr = sysctl_icmp_msgs_per_sec * delta / HZ ; + if (incr) + icmp_global.stamp = now; + } + credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); + if (credit) { + credit--; + rc = true; + } + icmp_global.credit = credit; + spin_unlock(&icmp_global.lock); + return rc; +} +EXPORT_SYMBOL(icmp_global_allow); + /* * Send an ICMP frame. */ -static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, - struct flowi4 *fl4, int type, int code) +static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, + struct flowi4 *fl4, int type, int code) { struct dst_entry *dst = &rt->dst; bool rc = true; @@ -253,8 +303,14 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, goto out; /* Limit if icmp type is enabled in ratemask. */ - if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { - struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); + if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) + goto out; + + rc = false; + if (icmp_global_allow()) { + struct inet_peer *peer; + + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); if (peer) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1599966f4639..8a25509c35b3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -730,6 +730,22 @@ static struct ctl_table ipv4_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "icmp_msgs_per_sec", + .data = &sysctl_icmp_msgs_per_sec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "icmp_msgs_burst", + .data = &sysctl_icmp_msgs_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, { .procname = "udp_mem", .data = &sysctl_udp_mem, diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 394bb824fe4b..141e1f3ab74e 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -170,11 +170,11 @@ static bool is_ineligible(const struct sk_buff *skb) /* * Check the ICMP output rate limit */ -static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, - struct flowi6 *fl6) +static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, + struct flowi6 *fl6) { - struct dst_entry *dst; struct net *net = sock_net(sk); + struct dst_entry *dst; bool res = false; /* Informational messages are not limited. */ @@ -199,16 +199,20 @@ static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else { struct rt6_info *rt = (struct rt6_info *)dst; int tmo = net->ipv6.sysctl.icmpv6_time; - struct inet_peer *peer; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); - res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); + if (icmp_global_allow()) { + struct inet_peer *peer; + + peer = inet_getpeer_v6(net->ipv6.peers, + &rt->rt6i_dst.addr, 1); + res = inet_peer_xrlim_allow(peer, tmo); + if (peer) + inet_putpeer(peer); + } } dst_release(dst); return res; -- cgit v1.2.3 From bd1e75abf4b3c666f61a5cf90c896aa928a735d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2014 08:26:20 -0700 Subject: tcp: add coalescing attempt in tcp_ofo_queue() In order to make TCP more resilient in presence of reorders, we need to allow coalescing to happen when skbs from out of order queue are transferred into receive queue. LRO/GRO can be completely canceled in some pathological cases, like per packet load balancing on aggregated links. I had to move tcp_try_coalesce() up in the file above tcp_ofo_queue() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 89 +++++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 13f3da4762e3..f3f016a15c5a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4061,6 +4061,44 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } +/** + * tcp_try_coalesce - try to merge skb to prior one + * @sk: socket + * @to: prior buffer + * @from: buffer to add in queue + * @fragstolen: pointer to boolean + * + * Before queueing skb @from after @to, try to merge them + * to reduce overall memory use and queue lengths, if cost is small. + * Packets in ofo or receive queues can stay a long time. + * Better try to coalesce them right now to avoid future collapses. + * Returns true if caller should free @from instead of queueing it + */ +static bool tcp_try_coalesce(struct sock *sk, + struct sk_buff *to, + struct sk_buff *from, + bool *fragstolen) +{ + int delta; + + *fragstolen = false; + + /* Its possible this segment overlaps with prior segment in queue */ + if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) + return false; + + if (!skb_try_coalesce(to, from, fragstolen, &delta)) + return false; + + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); + TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; + TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; + TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; + return true; +} + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ @@ -4068,7 +4106,8 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; - struct sk_buff *skb; + struct sk_buff *skb, *tail; + bool fragstolen, eaten; while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) @@ -4081,9 +4120,9 @@ static void tcp_ofo_queue(struct sock *sk) tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + __skb_unlink(skb, &tp->out_of_order_queue); if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received\n"); - __skb_unlink(skb, &tp->out_of_order_queue); __kfree_skb(skb); continue; } @@ -4091,11 +4130,15 @@ static void tcp_ofo_queue(struct sock *sk) tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - __skb_unlink(skb, &tp->out_of_order_queue); - __skb_queue_tail(&sk->sk_receive_queue, skb); + tail = skb_peek_tail(&sk->sk_receive_queue); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (!eaten) + __skb_queue_tail(&sk->sk_receive_queue, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); + if (eaten) + kfree_skb_partial(skb, fragstolen); } } @@ -4122,44 +4165,6 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, return 0; } -/** - * tcp_try_coalesce - try to merge skb to prior one - * @sk: socket - * @to: prior buffer - * @from: buffer to add in queue - * @fragstolen: pointer to boolean - * - * Before queueing skb @from after @to, try to merge them - * to reduce overall memory use and queue lengths, if cost is small. - * Packets in ofo or receive queues can stay a long time. - * Better try to coalesce them right now to avoid future collapses. - * Returns true if caller should free @from instead of queueing it - */ -static bool tcp_try_coalesce(struct sock *sk, - struct sk_buff *to, - struct sk_buff *from, - bool *fragstolen) -{ - int delta; - - *fragstolen = false; - - /* Its possible this segment overlaps with prior segment in queue */ - if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) - return false; - - if (!skb_try_coalesce(to, from, fragstolen, &delta)) - return false; - - atomic_add(delta, &sk->sk_rmem_alloc); - sk_mem_charge(sk, delta); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); - TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; - TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; - TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; - return true; -} - static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From 9e87f9a9c4c4754508b2c2638fbde9e10c7a103b Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sat, 13 Sep 2014 10:28:49 +0200 Subject: NFC: nci: Add support for proprietary RF Protocols In NFC Forum NCI specification, some RF Protocol values are reserved for proprietary use (from 0x80 to 0xfe). Some CLF vendor may need to use one value within this range for specific technology. Furthermore, some CLF may not becompliant with NFC Froum NCI specification 2.0 and therefore will not support RF Protocol value 0x06 for PROTOCOL_T5T as mention in a draft specification and in a recent push. Adding get_rf_protocol handle to the nci_ops structure will help to set the correct technology to target. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- drivers/nfc/st21nfcb/st21nfcb.c | 10 ++++++++++ include/net/nfc/nci_core.h | 9 +++++---- net/nfc/nci/ntf.c | 9 ++++++++- 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/nfc/st21nfcb/st21nfcb.c b/drivers/nfc/st21nfcb/st21nfcb.c index 90d37156169e..e72dae816e72 100644 --- a/drivers/nfc/st21nfcb/st21nfcb.c +++ b/drivers/nfc/st21nfcb/st21nfcb.c @@ -25,6 +25,8 @@ #define DRIVER_DESC "NCI NFC driver for ST21NFCB" +#define ST21NFCB_NCI1_X_PROPRIETARY_ISO15693 0x83 + static int st21nfcb_nci_open(struct nci_dev *ndev) { struct st21nfcb_nci_info *info = nci_get_drvdata(ndev); @@ -64,10 +66,18 @@ static int st21nfcb_nci_send(struct nci_dev *ndev, struct sk_buff *skb) return ndlc_send(info->ndlc, skb); } +static __u32 st21nfcb_nci_get_rfprotocol(struct nci_dev *ndev, + __u8 rf_protocol) +{ + return rf_protocol == ST21NFCB_NCI1_X_PROPRIETARY_ISO15693 ? + NFC_PROTO_ISO15693_MASK : 0; +} + static struct nci_ops st21nfcb_nci_ops = { .open = st21nfcb_nci_open, .close = st21nfcb_nci_close, .send = st21nfcb_nci_send, + .get_rfprotocol = st21nfcb_nci_get_rfprotocol, }; int st21nfcb_nci_probe(struct llt_ndlc *ndlc, int phy_headroom, diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index 1f9a0f5272fe..75d10e625c49 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -64,10 +64,11 @@ enum nci_state { struct nci_dev; struct nci_ops { - int (*open)(struct nci_dev *ndev); - int (*close)(struct nci_dev *ndev); - int (*send)(struct nci_dev *ndev, struct sk_buff *skb); - int (*setup)(struct nci_dev *ndev); + int (*open)(struct nci_dev *ndev); + int (*close)(struct nci_dev *ndev); + int (*send)(struct nci_dev *ndev, struct sk_buff *skb); + int (*setup)(struct nci_dev *ndev); + __u32 (*get_rfprotocol)(struct nci_dev *ndev, __u8 rf_protocol); }; #define NCI_MAX_SUPPORTED_RF_INTERFACES 4 diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 25e44cebd60a..205b35f666db 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -167,6 +167,13 @@ static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, return data; } +__u32 nci_get_prop_rf_protocol(struct nci_dev *ndev, __u8 rf_protocol) +{ + if (ndev->ops->get_rfprotocol) + return ndev->ops->get_rfprotocol(ndev, rf_protocol); + return 0; +} + static int nci_add_new_protocol(struct nci_dev *ndev, struct nfc_target *target, __u8 rf_protocol, @@ -195,7 +202,7 @@ static int nci_add_new_protocol(struct nci_dev *ndev, else if (rf_protocol == NCI_RF_PROTOCOL_T5T) protocol = NFC_PROTO_ISO15693_MASK; else - protocol = 0; + protocol = nci_get_prop_rf_protocol(ndev, rf_protocol); if (!(protocol & ndev->poll_prots)) { pr_err("the target found does not have the desired protocol\n"); -- cgit v1.2.3 From 2b0bf6c85a4940e00516f68ff7103329abf8512d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 22 Sep 2014 11:17:41 -0700 Subject: Bluetooth: Convert bt_ logging functions to return void No caller or macro uses the return value so make all the functions return void. Signed-off-by: Joe Perches Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 4 ++-- net/bluetooth/lib.c | 14 ++++---------- 2 files changed, 6 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 7e666d06b97f..58695ffeb138 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -120,9 +120,9 @@ struct bt_voice { #define BT_RCVMTU 13 __printf(1, 2) -int bt_info(const char *fmt, ...); +void bt_info(const char *fmt, ...); __printf(1, 2) -int bt_err(const char *fmt, ...); +void bt_err(const char *fmt, ...); #define BT_INFO(fmt, ...) bt_info(fmt "\n", ##__VA_ARGS__) #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 941ad7530eda..b36bc0415854 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -135,40 +135,34 @@ int bt_to_errno(__u16 code) } EXPORT_SYMBOL(bt_to_errno); -int bt_info(const char *format, ...) +void bt_info(const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = pr_info("%pV", &vaf); + pr_info("%pV", &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(bt_info); -int bt_err(const char *format, ...) +void bt_err(const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = pr_err("%pV", &vaf); + pr_err("%pV", &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(bt_err); -- cgit v1.2.3 From d41c15cf95bd91b9c333f6f749670e22c8a47ad9 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 24 Sep 2014 13:14:46 +0300 Subject: Bluetooth: Fix reason code used for rejecting SCO connections The core specification defines valid values for the HCI_Reject_Synchronous_Connection_Request command to be 0x0D-0x0F. So far the code has been using HCI_ERROR_REMOTE_USER_TERM (0x13) which is not a valid value and is therefore being rejected by some controllers: > HCI Event: Connect Request (0x04) plen 10 bdaddr 40:6F:2A:6A:E5:E0 class 0x000000 type eSCO < HCI Command: Reject Synchronous Connection (0x01|0x002a) plen 7 bdaddr 40:6F:2A:6A:E5:E0 reason 0x13 Reason: Remote User Terminated Connection > HCI Event: Command Status (0x0f) plen 4 Reject Synchronous Connection (0x01|0x002a) status 0x12 ncmd 1 Error: Invalid HCI Command Parameters This patch introduces a new define for a value from the valid range (0x0d == Connection Rejected Due To Limited Resources) and uses it instead for rejecting incoming connections. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_conn.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 3f8547f1c6f8..6e8f24967308 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -385,6 +385,7 @@ enum { #define HCI_ERROR_AUTH_FAILURE 0x05 #define HCI_ERROR_MEMORY_EXCEEDED 0x07 #define HCI_ERROR_CONNECTION_TIMEOUT 0x08 +#define HCI_ERROR_REJ_LIMITED_RESOURCES 0x0d #define HCI_ERROR_REJ_BAD_ADDR 0x0f #define HCI_ERROR_REMOTE_USER_TERM 0x13 #define HCI_ERROR_REMOTE_LOW_RESOURCES 0x14 diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 22b253750f78..445829cd363c 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -121,7 +121,7 @@ static void hci_reject_sco(struct hci_conn *conn) { struct hci_cp_reject_sync_conn_req cp; - cp.reason = HCI_ERROR_REMOTE_USER_TERM; + cp.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; bacpy(&cp.bdaddr, &conn->dst); hci_send_cmd(conn->hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp); -- cgit v1.2.3 From f19f4f9525cf32f97341fac20ce66392e86a1b67 Mon Sep 17 00:00:00 2001 From: Simon Vincent Date: Wed, 24 Sep 2014 12:21:33 +0200 Subject: ieee802154: 6lowpan: ensure header compression does not corrupt ipv6 header The 6lowpan ipv6 header compression was causing problems for other interfaces that expected a ipv6 header to still be in place, as we were replacing the ipv6 header with a compressed version. This happened if you sent a packet to a multicast address as the packet would be output on 802.15.4, ethernet, and also be sent to the loopback interface. The skb data was shared between these interfaces so all interfaces ended up with a compressed ipv6 header. The solution is to ensure that before we do any header compression we are not sharing the skb or skb data with any other interface. If we are then we must take a copy of the skb and skb data before modifying the ipv6 header. The only place we can copy the skb is inside the xmit function so we don't leave dangling references to skb. This patch moves all the header compression to inside the xmit function. Very little code has been changed it has mostly been moved from lowpan_header_create to lowpan_xmit. At the top of the xmit function we now check if the skb is shared and if so copy it. In lowpan_header_create all we do now is store the source and destination addresses for use later when we compress the header. Signed-off-by: Simon Vincent Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan_rtnl.c | 125 ++++++++++++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan_rtnl.c b/net/ieee802154/6lowpan_rtnl.c index 5e788cdc499a..44136297b673 100644 --- a/net/ieee802154/6lowpan_rtnl.c +++ b/net/ieee802154/6lowpan_rtnl.c @@ -71,20 +71,42 @@ struct lowpan_dev_record { struct list_head list; }; +/* don't save pan id, it's intra pan */ +struct lowpan_addr { + u8 mode; + union { + /* IPv6 needs big endian here */ + __be64 extended_addr; + __be16 short_addr; + } u; +}; + +struct lowpan_addr_info { + struct lowpan_addr daddr; + struct lowpan_addr saddr; +}; + static inline struct lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) { return netdev_priv(dev); } +static inline struct +lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb) +{ + WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct lowpan_addr_info)); + return (struct lowpan_addr_info *)(skb->data - + sizeof(struct lowpan_addr_info)); +} + static int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) { const u8 *saddr = _saddr; const u8 *daddr = _daddr; - struct ieee802154_addr sa, da; - struct ieee802154_mac_cb *cb = mac_cb_init(skb); + struct lowpan_addr_info *info; /* TODO: * if this package isn't ipv6 one, where should it be routed? @@ -98,41 +120,17 @@ static int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8); raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8); - lowpan_header_compress(skb, dev, type, daddr, saddr, len); - - /* NOTE1: I'm still unsure about the fact that compression and WPAN - * header are created here and not later in the xmit. So wait for - * an opinion of net maintainers. - */ - /* NOTE2: to be absolutely correct, we must derive PANid information - * from MAC subif of the 'dev' and 'real_dev' network devices, but - * this isn't implemented in mainline yet, so currently we assign 0xff - */ - cb->type = IEEE802154_FC_TYPE_DATA; - - /* prepare wpan address data */ - sa.mode = IEEE802154_ADDR_LONG; - sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); - sa.extended_addr = ieee802154_devaddr_from_raw(saddr); - - /* intra-PAN communications */ - da.pan_id = sa.pan_id; - - /* if the destination address is the broadcast address, use the - * corresponding short address - */ - if (lowpan_is_addr_broadcast(daddr)) { - da.mode = IEEE802154_ADDR_SHORT; - da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); - } else { - da.mode = IEEE802154_ADDR_LONG; - da.extended_addr = ieee802154_devaddr_from_raw(daddr); - } + info = lowpan_skb_priv(skb); - cb->ackreq = !lowpan_is_addr_broadcast(daddr); + /* TODO: Currently we only support extended_addr */ + info->daddr.mode = IEEE802154_ADDR_LONG; + memcpy(&info->daddr.u.extended_addr, daddr, + sizeof(info->daddr.u.extended_addr)); + info->saddr.mode = IEEE802154_ADDR_LONG; + memcpy(&info->saddr.u.extended_addr, saddr, + sizeof(info->daddr.u.extended_addr)); - return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, - type, (void *)&da, (void *)&sa, 0); + return 0; } static int lowpan_give_skb_to_devices(struct sk_buff *skb, @@ -330,13 +328,68 @@ err: return rc; } +static int lowpan_header(struct sk_buff *skb, struct net_device *dev) +{ + struct ieee802154_addr sa, da; + struct ieee802154_mac_cb *cb = mac_cb_init(skb); + struct lowpan_addr_info info; + void *daddr, *saddr; + + memcpy(&info, lowpan_skb_priv(skb), sizeof(info)); + + /* TODO: Currently we only support extended_addr */ + daddr = &info.daddr.u.extended_addr; + saddr = &info.saddr.u.extended_addr; + + lowpan_header_compress(skb, dev, ETH_P_IPV6, daddr, saddr, skb->len); + + cb->type = IEEE802154_FC_TYPE_DATA; + + /* prepare wpan address data */ + sa.mode = IEEE802154_ADDR_LONG; + sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + sa.extended_addr = ieee802154_devaddr_from_raw(saddr); + + /* intra-PAN communications */ + da.pan_id = sa.pan_id; + + /* if the destination address is the broadcast address, use the + * corresponding short address + */ + if (lowpan_is_addr_broadcast((const u8 *)daddr)) { + da.mode = IEEE802154_ADDR_SHORT; + da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); + cb->ackreq = false; + } else { + da.mode = IEEE802154_ADDR_LONG; + da.extended_addr = ieee802154_devaddr_from_raw(daddr); + cb->ackreq = true; + } + + return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, + ETH_P_IPV6, (void *)&da, (void *)&sa, 0); +} + static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) { struct ieee802154_hdr wpan_hdr; - int max_single; + int max_single, ret; pr_debug("package xmit\n"); + /* We must take a copy of the skb before we modify/replace the ipv6 + * header as the header could be used elsewhere + */ + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + + ret = lowpan_header(skb, dev); + if (ret < 0) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + if (ieee802154_hdr_peek(skb, &wpan_hdr) < 0) { kfree_skb(skb); return NET_XMIT_DROP; -- cgit v1.2.3 From c7da579763f29cf45a861ad4c339aba590d8b80d Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 24 Sep 2014 22:41:46 +0300 Subject: Bluetooth: Add retransmission effort into SCO parameter table It is expected that new parameter combinations will have the retransmission effort value different between some entries (mainly because of the new S4 configuration added by HFP 1.7), so it makes sense to move it into the table instead of having it hard coded based on the selected SCO_AIRMODE_*. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 445829cd363c..06047142797c 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -36,24 +36,25 @@ struct sco_param { u16 pkt_type; u16 max_latency; + u8 retrans_effort; }; static const struct sco_param esco_param_cvsd[] = { - { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a }, /* S3 */ - { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007 }, /* S2 */ - { EDR_ESCO_MASK | ESCO_EV3, 0x0007 }, /* S1 */ - { EDR_ESCO_MASK | ESCO_HV3, 0xffff }, /* D1 */ - { EDR_ESCO_MASK | ESCO_HV1, 0xffff }, /* D0 */ + { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a, 0x01 }, /* S3 */ + { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007, 0x01 }, /* S2 */ + { EDR_ESCO_MASK | ESCO_EV3, 0x0007, 0x01 }, /* S1 */ + { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0x01 }, /* D1 */ + { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0x01 }, /* D0 */ }; static const struct sco_param sco_param_cvsd[] = { - { EDR_ESCO_MASK | ESCO_HV3, 0xffff }, /* D1 */ - { EDR_ESCO_MASK | ESCO_HV1, 0xffff }, /* D0 */ + { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0xff }, /* D1 */ + { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0xff }, /* D0 */ }; static const struct sco_param sco_param_wideband[] = { - { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d }, /* T2 */ - { EDR_ESCO_MASK | ESCO_EV3, 0x0008 }, /* T1 */ + { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d, 0x02 }, /* T2 */ + { EDR_ESCO_MASK | ESCO_EV3, 0x0008, 0x02 }, /* T1 */ }; static void hci_le_create_connection_cancel(struct hci_conn *conn) @@ -208,19 +209,16 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) case SCO_AIRMODE_TRANSP: if (conn->attempt > ARRAY_SIZE(sco_param_wideband)) return false; - cp.retrans_effort = 0x02; param = &sco_param_wideband[conn->attempt - 1]; break; case SCO_AIRMODE_CVSD: if (lmp_esco_capable(conn->link)) { if (conn->attempt > ARRAY_SIZE(esco_param_cvsd)) return false; - cp.retrans_effort = 0x01; param = &esco_param_cvsd[conn->attempt - 1]; } else { if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) return false; - cp.retrans_effort = 0xff; param = &sco_param_cvsd[conn->attempt - 1]; } break; @@ -228,6 +226,7 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) return false; } + cp.retrans_effort = param->retrans_effort; cp.pkt_type = __cpu_to_le16(param->pkt_type); cp.max_latency = __cpu_to_le16(param->max_latency); -- cgit v1.2.3 From 565766b087a6d6ff257f5b79c8ceda0188c9169f Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 25 Sep 2014 09:48:01 +0300 Subject: Bluetooth: Rename sco_param_wideband table to esco_param_msbc The sco_param_wideband table represents the eSCO parameters for specifically mSBC encoding. This patch renames the table to the more descriptive esco_param_msbc name. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 06047142797c..b9517bd17190 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -52,7 +52,7 @@ static const struct sco_param sco_param_cvsd[] = { { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0xff }, /* D0 */ }; -static const struct sco_param sco_param_wideband[] = { +static const struct sco_param esco_param_msbc[] = { { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d, 0x02 }, /* T2 */ { EDR_ESCO_MASK | ESCO_EV3, 0x0008, 0x02 }, /* T1 */ }; @@ -207,9 +207,9 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: - if (conn->attempt > ARRAY_SIZE(sco_param_wideband)) + if (conn->attempt > ARRAY_SIZE(esco_param_msbc)) return false; - param = &sco_param_wideband[conn->attempt - 1]; + param = &esco_param_msbc[conn->attempt - 1]; break; case SCO_AIRMODE_CVSD: if (lmp_esco_capable(conn->link)) { -- cgit v1.2.3 From d020f8f73318589bf41f864b7f89f95669350873 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 20 Sep 2014 14:52:28 -0700 Subject: tcp: move logic out of tcp_v[64]_gso_send_check In tcp_v[46]_gso_send_check the TCP checksum is initialized to the pseudo header checksum using __tcp_v[46]_send_check. We can move this logic into new tcp[46]_gso_segment functions to be done when ip_summed != CHECKSUM_PARTIAL (ip_summed == CHECKSUM_PARTIAL should be the common case, possibly always true when taking GSO path). After this change tcp_v[46]_gso_send_check is no-op. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_offload.c | 39 +++++++++++++++++++++++---------------- net/ipv6/tcpv6_offload.c | 37 ++++++++++++++++++++++++------------- 2 files changed, 47 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 72912533a191..7cd12b0458ff 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -29,6 +29,28 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, } } +struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) + return ERR_PTR(-EINVAL); + + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + + /* Set up checksum pseudo header, usually expect stack to + * have done this already. + */ + + th->check = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v4_send_check(skb, iph->saddr, iph->daddr); + } + + return tcp_gso_segment(skb, features); +} + struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -44,9 +66,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, __sum16 newcheck; bool ooo_okay, copy_destructor; - if (!pskb_may_pull(skb, sizeof(*th))) - goto out; - th = tcp_hdr(skb); thlen = th->doff * 4; if (thlen < sizeof(*th)) @@ -271,18 +290,6 @@ EXPORT_SYMBOL(tcp_gro_complete); static int tcp_v4_gso_send_check(struct sk_buff *skb) { - const struct iphdr *iph; - struct tcphdr *th; - - if (!pskb_may_pull(skb, sizeof(*th))) - return -EINVAL; - - iph = ip_hdr(skb); - th = tcp_hdr(skb); - - th->check = 0; - skb->ip_summed = CHECKSUM_PARTIAL; - __tcp_v4_send_check(skb, iph->saddr, iph->daddr); return 0; } @@ -314,7 +321,7 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff) static const struct net_offload tcpv4_offload = { .callbacks = { .gso_send_check = tcp_v4_gso_send_check, - .gso_segment = tcp_gso_segment, + .gso_segment = tcp4_gso_segment, .gro_receive = tcp4_gro_receive, .gro_complete = tcp4_gro_complete, }, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index dbb3d9262bf6..96253154db3a 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -17,18 +17,6 @@ static int tcp_v6_gso_send_check(struct sk_buff *skb) { - const struct ipv6hdr *ipv6h; - struct tcphdr *th; - - if (!pskb_may_pull(skb, sizeof(*th))) - return -EINVAL; - - ipv6h = ipv6_hdr(skb); - th = tcp_hdr(skb); - - th->check = 0; - skb->ip_summed = CHECKSUM_PARTIAL; - __tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr); return 0; } @@ -58,10 +46,33 @@ static int tcp6_gro_complete(struct sk_buff *skb, int thoff) return tcp_gro_complete(skb); } +struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct tcphdr *th; + + if (!pskb_may_pull(skb, sizeof(*th))) + return ERR_PTR(-EINVAL); + + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + + /* Set up pseudo header, usually expect stack to have done + * this. + */ + + th->check = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr); + } + + return tcp_gso_segment(skb, features); +} static const struct net_offload tcpv6_offload = { .callbacks = { .gso_send_check = tcp_v6_gso_send_check, - .gso_segment = tcp_gso_segment, + .gso_segment = tcp6_gso_segment, .gro_receive = tcp6_gro_receive, .gro_complete = tcp6_gro_complete, }, -- cgit v1.2.3 From f71470b37e79d6eb151debd47364d920b7babd30 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 20 Sep 2014 14:52:29 -0700 Subject: udp: move logic out of udp[46]_ufo_send_check In udp[46]_ufo_send_check the UDP checksum initialized to the pseudo header checksum. We can move this logic into udp[46]_ufo_fragment. After this change udp[64]_ufo_send_check is a no-op. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 37 +++++++++++++++---------------------- net/ipv6/udp_offload.c | 40 ++++++++++++++++++---------------------- 2 files changed, 33 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index d7c43f764c71..2918cc914824 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -27,23 +27,6 @@ struct udp_offload_priv { static int udp4_ufo_send_check(struct sk_buff *skb) { - if (!pskb_may_pull(skb, sizeof(struct udphdr))) - return -EINVAL; - - if (likely(!skb->encapsulation)) { - const struct iphdr *iph; - struct udphdr *uh; - - iph = ip_hdr(skb); - uh = udp_hdr(skb); - - uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, - IPPROTO_UDP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - } - return 0; } @@ -128,8 +111,9 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; - int offset; __wsum csum; + struct udphdr *uh; + struct iphdr *iph; if (skb->encapsulation && (skb_shinfo(skb)->gso_type & @@ -138,6 +122,9 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, goto out; } + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; @@ -165,10 +152,16 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, * HW cannot do checksum of UDP packets sent as multiple * IP fragments. */ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); + + uh = udp_hdr(skb); + iph = ip_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_NONE; /* Fragment the skb. IP headers of the fragments are updated in diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index de85f809bf29..e4af6437ea3b 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -19,23 +19,6 @@ static int udp6_ufo_send_check(struct sk_buff *skb) { - const struct ipv6hdr *ipv6h; - struct udphdr *uh; - - if (!pskb_may_pull(skb, sizeof(*uh))) - return -EINVAL; - - if (likely(!skb->encapsulation)) { - ipv6h = ipv6_hdr(skb); - uh = udp_hdr(skb); - - uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, - IPPROTO_UDP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - } - return 0; } @@ -49,7 +32,6 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u8 *packet_start, *prevhdr; u8 nexthdr; u8 frag_hdr_sz = sizeof(struct frag_hdr); - int offset; __wsum csum; int tnl_hlen; @@ -83,13 +65,27 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = skb_udp_tunnel_segment(skb, features); else { + const struct ipv6hdr *ipv6h; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot * do checksum of UDP packets sent as multiple IP fragments. */ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); + + uh = udp_hdr(skb); + ipv6h = ipv6_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v6_check(skb->len, &ipv6h->saddr, + &ipv6h->daddr, csum); + + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_NONE; /* Check if there is enough headroom to insert fragment header. */ -- cgit v1.2.3 From 53e50398968d43338c4d932114e68bc099fc5fbd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 20 Sep 2014 14:52:30 -0700 Subject: net: Remove gso_send_check as an offload callback The send_check logic was only interesting in cases of TCP offload and UDP UFO where the checksum needed to be initialized to the pseudo header checksum. Now we've moved that logic into the related gso_segment functions so gso_send_check is no longer needed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - net/core/dev.c | 10 ---------- net/ipv4/af_inet.c | 36 ------------------------------------ net/ipv4/gre_offload.c | 11 +++-------- net/ipv4/tcp_offload.c | 6 ------ net/ipv4/udp_offload.c | 6 ------ net/ipv6/ip6_offload.c | 27 --------------------------- net/ipv6/tcpv6_offload.c | 6 ------ net/ipv6/udp_offload.c | 6 ------ net/mpls/mpls_gso.c | 7 ------- 10 files changed, 3 insertions(+), 113 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4354b4307e37..9f5d293a0281 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1911,7 +1911,6 @@ struct packet_type { struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); - int (*gso_send_check)(struct sk_buff *skb); struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); diff --git a/net/core/dev.c b/net/core/dev.c index db0388607329..e2ced01c01e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2422,16 +2422,6 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - int err; - - err = ptype->callbacks.gso_send_check(skb); - segs = ERR_PTR(err); - if (err || skb_gso_ok(skb, features)) - break; - __skb_push(skb, (skb->data - - skb_network_header(skb))); - } segs = ptype->callbacks.gso_segment(skb, features); break; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 72011cc4c13b..28e589c5f32d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1197,40 +1197,6 @@ int inet_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL(inet_sk_rebuild_header); -static int inet_gso_send_check(struct sk_buff *skb) -{ - const struct net_offload *ops; - const struct iphdr *iph; - int proto; - int ihl; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) - goto out; - - iph = ip_hdr(skb); - ihl = iph->ihl * 4; - if (ihl < sizeof(*iph)) - goto out; - - proto = iph->protocol; - - /* Warning: after this point, iph might be no longer valid */ - if (unlikely(!pskb_may_pull(skb, ihl))) - goto out; - __skb_pull(skb, ihl); - - skb_reset_transport_header(skb); - err = -EPROTONOSUPPORT; - - ops = rcu_dereference(inet_offloads[proto]); - if (likely(ops && ops->callbacks.gso_send_check)) - err = ops->callbacks.gso_send_check(skb); - -out: - return err; -} - static struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -1655,7 +1621,6 @@ static int ipv4_proc_init(void); static struct packet_offload ip_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .callbacks = { - .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, @@ -1664,7 +1629,6 @@ static struct packet_offload ip_packet_offload __read_mostly = { static const struct net_offload ipip_offload = { .callbacks = { - .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index d3fe2ac05167..a77729503071 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -15,13 +15,6 @@ #include #include -static int gre_gso_send_check(struct sk_buff *skb) -{ - if (!skb->encapsulation) - return -EINVAL; - return 0; -} - static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -46,6 +39,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_IPIP))) goto out; + if (!skb->encapsulation) + goto out; + if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) goto out; @@ -256,7 +252,6 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload gre_offload = { .callbacks = { - .gso_send_check = gre_gso_send_check, .gso_segment = gre_gso_segment, .gro_receive = gre_gro_receive, .gro_complete = gre_gro_complete, diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 7cd12b0458ff..5b90f2f447a5 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -288,11 +288,6 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static int tcp_v4_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ @@ -320,7 +315,6 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff) static const struct net_offload tcpv4_offload = { .callbacks = { - .gso_send_check = tcp_v4_gso_send_check, .gso_segment = tcp4_gso_segment, .gro_receive = tcp4_gro_receive, .gro_complete = tcp4_gro_complete, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2918cc914824..19ebe6a39ddc 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -25,11 +25,6 @@ struct udp_offload_priv { struct udp_offload_priv __rcu *next; }; -static int udp4_ufo_send_check(struct sk_buff *skb) -{ - return 0; -} - struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features) { @@ -346,7 +341,6 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv4_offload = { .callbacks = { - .gso_send_check = udp4_ufo_send_check, .gso_segment = udp4_ufo_fragment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 9952f3fce30a..9034f76ae013 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -53,31 +53,6 @@ static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) return proto; } -static int ipv6_gso_send_check(struct sk_buff *skb) -{ - const struct ipv6hdr *ipv6h; - const struct net_offload *ops; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - goto out; - - ipv6h = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*ipv6h)); - err = -EPROTONOSUPPORT; - - ops = rcu_dereference(inet6_offloads[ - ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr)]); - - if (likely(ops && ops->callbacks.gso_send_check)) { - skb_reset_transport_header(skb); - err = ops->callbacks.gso_send_check(skb); - } - -out: - return err; -} - static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -306,7 +281,6 @@ out_unlock: static struct packet_offload ipv6_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .callbacks = { - .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, @@ -315,7 +289,6 @@ static struct packet_offload ipv6_packet_offload __read_mostly = { static const struct net_offload sit_offload = { .callbacks = { - .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 96253154db3a..c1ab77105b4c 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -15,11 +15,6 @@ #include #include "ip6_offload.h" -static int tcp_v6_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -71,7 +66,6 @@ struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, } static const struct net_offload tcpv6_offload = { .callbacks = { - .gso_send_check = tcp_v6_gso_send_check, .gso_segment = tcp6_gso_segment, .gro_receive = tcp6_gro_receive, .gro_complete = tcp6_gro_complete, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index e4af6437ea3b..212ebfc7973f 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -17,11 +17,6 @@ #include #include "ip6_offload.h" -static int udp6_ufo_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { @@ -166,7 +161,6 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv6_offload = { .callbacks = { - .gso_send_check = udp6_ufo_send_check, .gso_segment = udp6_ufo_fragment, .gro_receive = udp6_gro_receive, .gro_complete = udp6_gro_complete, diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 6b38d083e1c9..e28ed2ef5b06 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -65,15 +65,9 @@ out: return segs; } -static int mpls_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct packet_offload mpls_mc_offload = { .type = cpu_to_be16(ETH_P_MPLS_MC), .callbacks = { - .gso_send_check = mpls_gso_send_check, .gso_segment = mpls_gso_segment, }, }; @@ -81,7 +75,6 @@ static struct packet_offload mpls_mc_offload = { static struct packet_offload mpls_uc_offload = { .type = cpu_to_be16(ETH_P_MPLS_UC), .callbacks = { - .gso_send_check = mpls_gso_send_check, .gso_segment = mpls_gso_segment, }, }; -- cgit v1.2.3 From 4a8e320c929991c9480a7b936512c57ea02d87b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 20 Sep 2014 18:01:30 -0700 Subject: net: sched: use pinned timers While using a MQ + NETEM setup, I had confirmation that the default timer migration ( /proc/sys/kernel/timer_migration ) is killing us. Installing this on a receiver side of a TCP_STREAM test, (NIC has 8 TX queues) : EST="est 1sec 4sec" for ETH in eth1 do tc qd del dev $ETH root 2>/dev/null tc qd add dev $ETH root handle 1: mq tc qd add dev $ETH parent 1:1 $EST netem limit 70000 delay 6ms tc qd add dev $ETH parent 1:2 $EST netem limit 70000 delay 8ms tc qd add dev $ETH parent 1:3 $EST netem limit 70000 delay 10ms tc qd add dev $ETH parent 1:4 $EST netem limit 70000 delay 12ms tc qd add dev $ETH parent 1:5 $EST netem limit 70000 delay 14ms tc qd add dev $ETH parent 1:6 $EST netem limit 70000 delay 16ms tc qd add dev $ETH parent 1:7 $EST netem limit 80000 delay 18ms tc qd add dev $ETH parent 1:8 $EST netem limit 90000 delay 20ms done We can see that timers get migrated into a single cpu, presumably idle at the time timers are set up. Then all qdisc dequeues run from this cpu and huge lock contention happens. This single cpu is stuck in softirq mode and cannot dequeue fast enough. 39.24% [kernel] [k] _raw_spin_lock 2.65% [kernel] [k] netem_enqueue 1.80% [kernel] [k] netem_dequeue 1.63% [kernel] [k] copy_user_enhanced_fast_string 1.45% [kernel] [k] _raw_spin_lock_bh By pinning qdisc timers on the cpu running the qdisc, we respect proper XPS setting and remove this lock contention. 5.84% [kernel] [k] netem_enqueue 4.83% [kernel] [k] _raw_spin_lock 2.92% [kernel] [k] copy_user_enhanced_fast_string Current Qdiscs that benefit from this change are : netem, cbq, fq, hfsc, tbf, htb. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_api.c | 4 ++-- net/sched/sch_cbq.c | 4 ++-- net/sched/sch_htb.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ca6248345937..15e7beee266c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -586,7 +586,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) { - hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = qdisc; } @@ -602,7 +602,7 @@ void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) hrtimer_start(&wd->timer, ns_to_ktime(expires), - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); } EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index a3244a800501..d2cd981ba60d 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -617,7 +617,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) time = ktime_set(0, 0); time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay)); - hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); + hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED); } qdisc_unthrottled(sch); @@ -1386,7 +1386,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) q->link.minidle = -0x7FFFFFFF; qdisc_watchdog_init(&q->watchdog, sch); - hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); q->delay_timer.function = cbq_undelay; q->toplevel = TC_CBQ_MAXLEVEL; q->now = psched_get_time(); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 14408f262143..063e953d9848 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -932,7 +932,7 @@ ok: ktime_t time = ns_to_ktime(next_event); qdisc_throttled(q->watchdog.qdisc); hrtimer_start(&q->watchdog.timer, time, - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); } } else { schedule_work(&q->work); -- cgit v1.2.3 From 772476df7047db87ac4174d1ed396512912d23bf Mon Sep 17 00:00:00 2001 From: Rob Jones Date: Fri, 19 Sep 2014 11:27:51 +0100 Subject: net/netfilter/x_tables.c: use __seq_open_private() Reduce boilerplate code by using __seq_open_private() instead of seq_open() in xt_match_open() and xt_target_open(). Signed-off-by: Rob Jones Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 272ae4d6fdf4..133eb4772f12 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1101,22 +1101,11 @@ static const struct seq_operations xt_match_seq_ops = { static int xt_match_open(struct inode *inode, struct file *file) { - struct seq_file *seq; struct nf_mttg_trav *trav; - int ret; - - trav = kmalloc(sizeof(*trav), GFP_KERNEL); - if (trav == NULL) + trav = __seq_open_private(file, &xt_match_seq_ops, sizeof(*trav)); + if (!trav) return -ENOMEM; - ret = seq_open(file, &xt_match_seq_ops); - if (ret < 0) { - kfree(trav); - return ret; - } - - seq = file->private_data; - seq->private = trav; trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } @@ -1165,22 +1154,11 @@ static const struct seq_operations xt_target_seq_ops = { static int xt_target_open(struct inode *inode, struct file *file) { - struct seq_file *seq; struct nf_mttg_trav *trav; - int ret; - - trav = kmalloc(sizeof(*trav), GFP_KERNEL); - if (trav == NULL) + trav = __seq_open_private(file, &xt_target_seq_ops, sizeof(*trav)); + if (!trav) return -ENOMEM; - ret = seq_open(file, &xt_target_seq_ops); - if (ret < 0) { - kfree(trav); - return ret; - } - - seq = file->private_data; - seq->private = trav; trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } -- cgit v1.2.3 From 7276ca3fa23864133f5ee7431c51546d9b7f695f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 Sep 2014 13:28:16 +0200 Subject: netfilter: bridge: nf_bridge_copy_header as static inline in header Move nf_bridge_copy_header() as static inline in netfilter_bridge.h header file. This patch prepares the modularization of the br_netfilter code. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 48 +++++++++++++++++++++++++++++++--------- net/bridge/br_netfilter.c | 28 ----------------------- 2 files changed, 38 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 8ab1c278b66d..fe996d59de64 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -24,16 +24,6 @@ enum nf_br_hook_priorities { #define BRNF_8021Q 0x10 #define BRNF_PPPoE 0x20 -/* Only used in br_forward.c */ -int nf_bridge_copy_header(struct sk_buff *skb); -static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb) -{ - if (skb->nf_bridge && - skb->nf_bridge->mask & (BRNF_BRIDGED | BRNF_BRIDGED_DNAT)) - return nf_bridge_copy_header(skb); - return 0; -} - static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) { switch (skb->protocol) { @@ -46,6 +36,44 @@ static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) } } +static inline void nf_bridge_update_protocol(struct sk_buff *skb) +{ + if (skb->nf_bridge->mask & BRNF_8021Q) + skb->protocol = htons(ETH_P_8021Q); + else if (skb->nf_bridge->mask & BRNF_PPPoE) + skb->protocol = htons(ETH_P_PPP_SES); +} + +/* Fill in the header for fragmented IP packets handled by + * the IPv4 connection tracking code. + * + * Only used in br_forward.c + */ +static inline int nf_bridge_copy_header(struct sk_buff *skb) +{ + int err; + unsigned int header_size; + + nf_bridge_update_protocol(skb); + header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); + err = skb_cow_head(skb, header_size); + if (err) + return err; + + skb_copy_to_linear_data_offset(skb, -header_size, + skb->nf_bridge->data, header_size); + __skb_push(skb, nf_bridge_encap_header_len(skb)); + return 0; +} + +static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb) +{ + if (skb->nf_bridge && + skb->nf_bridge->mask & (BRNF_BRIDGED | BRNF_BRIDGED_DNAT)) + return nf_bridge_copy_header(skb); + return 0; +} + static inline unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { if (unlikely(skb->nf_bridge->mask & BRNF_PPPoE)) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index a615264cf01a..61929a7cd815 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -245,14 +245,6 @@ static inline void nf_bridge_save_header(struct sk_buff *skb) skb->nf_bridge->data, header_size); } -static inline void nf_bridge_update_protocol(struct sk_buff *skb) -{ - if (skb->nf_bridge->mask & BRNF_8021Q) - skb->protocol = htons(ETH_P_8021Q); - else if (skb->nf_bridge->mask & BRNF_PPPoE) - skb->protocol = htons(ETH_P_PPP_SES); -} - /* When handing a packet over to the IP layer * check whether we have a skb that is in the * expected format @@ -320,26 +312,6 @@ drop: return -1; } -/* Fill in the header for fragmented IP packets handled by - * the IPv4 connection tracking code. - */ -int nf_bridge_copy_header(struct sk_buff *skb) -{ - int err; - unsigned int header_size; - - nf_bridge_update_protocol(skb); - header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); - err = skb_cow_head(skb, header_size); - if (err) - return err; - - skb_copy_to_linear_data_offset(skb, -header_size, - skb->nf_bridge->data, header_size); - __skb_push(skb, nf_bridge_encap_header_len(skb)); - return 0; -} - /* PF_BRIDGE/PRE_ROUTING *********************************************/ /* Undo the changes made for ip6tables PREROUTING and continue the * bridge PRE_ROUTING hook. */ -- cgit v1.2.3 From 34666d467cbf1e2e3c7bb15a63eccfb582cdd71f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 18 Sep 2014 11:29:03 +0200 Subject: netfilter: bridge: move br_netfilter out of the core Jesper reported that br_netfilter always registers the hooks since this is part of the bridge core. This harms performance for people that don't need this. This patch modularizes br_netfilter so it can be rmmod'ed, thus, the hooks can be unregistered. I think the bridge netfilter should have been a separated module since the beginning, Patrick agreed on that. Note that this is breaking compatibility for users that expect that bridge netfilter is going to be available after explicitly 'modprobe bridge' or via automatic load through brctl. However, the damage can be easily undone by modprobing br_netfilter. The bridge core also spots a message to provide a clue to people that didn't notice that this has been deprecated. On top of that, the plan is that nftables will not rely on this software layer, but integrate the connection tracking into the bridge layer to enable stateful filtering and NAT, which is was bridge netfilter users seem to require. This patch still keeps the fake_dst_ops in the bridge core, since this is required by when the bridge port is initialized. So we can safely modprobe/rmmod br_netfilter anytime. Signed-off-by: Pablo Neira Ayuso Acked-by: Florian Westphal --- include/linux/netfilter_bridge.h | 2 +- include/linux/skbuff.h | 12 ++--- include/net/neighbour.h | 2 +- include/net/netfilter/ipv4/nf_reject.h | 2 +- include/net/netfilter/ipv6/nf_reject.h | 2 +- net/Kconfig | 7 +-- net/bridge/Makefile | 5 +- net/bridge/br.c | 14 ++--- net/bridge/br_device.c | 4 +- net/bridge/br_forward.c | 2 + net/bridge/br_input.c | 1 + net/bridge/br_netfilter.c | 88 ++++++------------------------- net/bridge/br_netlink.c | 2 +- net/bridge/br_nf_core.c | 96 ++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 12 ++--- net/bridge/br_sysfs_br.c | 4 +- 16 files changed, 151 insertions(+), 104 deletions(-) create mode 100644 net/bridge/br_nf_core.c (limited to 'net') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index fe996d59de64..c755e4971fa3 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -15,7 +15,7 @@ enum nf_br_hook_priorities { NF_BR_PRI_LAST = INT_MAX, }; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #define BRNF_PKT_TYPE 0x01 #define BRNF_BRIDGED_DNAT 0x02 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 07c9fdd0c126..c4ff43f84573 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -156,7 +156,7 @@ struct nf_conntrack { }; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { atomic_t use; unsigned int mask; @@ -560,7 +560,7 @@ struct sk_buff { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack *nfct; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info *nf_bridge; #endif @@ -2977,7 +2977,7 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) atomic_inc(&nfct->use); } #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) { if (nf_bridge && atomic_dec_and_test(&nf_bridge->use)) @@ -2995,7 +2995,7 @@ static inline void nf_reset(struct sk_buff *skb) nf_conntrack_put(skb->nfct); skb->nfct = NULL; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); skb->nf_bridge = NULL; #endif @@ -3016,7 +3016,7 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) nf_conntrack_get(src->nfct); dst->nfctinfo = src->nfctinfo; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dst->nf_bridge = src->nf_bridge; nf_bridge_get(src->nf_bridge); #endif @@ -3030,7 +3030,7 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(dst->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(dst->nf_bridge); #endif __nf_copy(dst, src); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 47f425464f84..f60558d0254c 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -373,7 +373,7 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) return 0; } -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index 931fbf812171..f713b5a31d62 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -98,7 +98,7 @@ static void nf_send_reset(struct sk_buff *oldskb, int hook) nf_ct_attach(nskb, oldskb); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip_local_out for bridged traffic, the MAC source on * the RST will be ours, instead of the destination's. This confuses * some routers/firewalls, and they drop the packet. So we need to diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index 710d17ed70b4..7a10cfcd8e33 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -147,7 +147,7 @@ static void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) nf_ct_attach(nskb, oldskb); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip6_local_out for bridged traffic, the MAC source on * the RST will be ours, instead of the destination's. This confuses * some routers/firewalls, and they drop the packet. So we need to diff --git a/net/Kconfig b/net/Kconfig index 4051fdfa4367..dc5d700d05e7 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -176,10 +176,11 @@ config NETFILTER_ADVANCED If unsure, say Y. config BRIDGE_NETFILTER - bool "Bridged IP/ARP packets filtering" - depends on BRIDGE && NETFILTER && INET + tristate "Bridged IP/ARP packets filtering" + depends on (BRIDGE || BRIDGE=n) + depends on NETFILTER && INET depends on NETFILTER_ADVANCED - default y + default m ---help--- Enabling this option will let arptables resp. iptables see bridged ARP resp. IP traffic. If you want a bridging firewall, you probably diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 8590b942bffa..5e3eac5dc8b9 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -6,11 +6,12 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_stp.o br_stp_bpdu.o \ - br_stp_if.o br_stp_timer.o br_netlink.o + br_stp_if.o br_stp_timer.o br_netlink.o \ + br_nf_core.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o -bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o +obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o diff --git a/net/bridge/br.c b/net/bridge/br.c index 1a755a1e5410..44425aff7cba 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -161,7 +161,7 @@ static int __init br_init(void) if (err) goto err_out1; - err = br_netfilter_init(); + err = br_nf_core_init(); if (err) goto err_out2; @@ -179,11 +179,16 @@ static int __init br_init(void) br_fdb_test_addr_hook = br_fdb_test_addr; #endif + pr_info("bridge: automatic filtering via arp/ip/ip6tables has been " + "deprecated. Update your scripts to load br_netfilter if you " + "need this.\n"); + return 0; + err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: - br_netfilter_fini(); + br_nf_core_fini(); err_out2: unregister_pernet_subsys(&br_net_ops); err_out1: @@ -196,20 +201,17 @@ err_out: static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); - br_netlink_fini(); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); - unregister_pernet_subsys(&br_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ - br_netfilter_fini(); + br_nf_core_fini(); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = NULL; #endif - br_fdb_fini(); } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 568cccd39a3d..659cac15c0df 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -36,7 +36,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) u16 vid = 0; rcu_read_lock(); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { br_nf_pre_routing_finish_bridge_slow(skb); rcu_read_unlock(); @@ -167,7 +167,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* remember the MTU in the rtable for PMTU */ dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu); #endif diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 056b67b0e277..992ec49a96aa 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -49,6 +49,7 @@ int br_dev_queue_push_xmit(struct sk_buff *skb) return 0; } +EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct sk_buff *skb) { @@ -56,6 +57,7 @@ int br_forward_finish(struct sk_buff *skb) br_dev_queue_push_xmit); } +EXPORT_SYMBOL_GPL(br_forward_finish); static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 366c43649079..6fd5522df696 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -140,6 +140,7 @@ drop: kfree_skb(skb); goto out; } +EXPORT_SYMBOL_GPL(br_handle_frame_finish); /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct sk_buff *skb) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 61929a7cd815..97e43937aaca 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -111,66 +111,6 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) pppoe_proto(skb) == htons(PPP_IPV6) && \ brnf_filter_pppoe_tagged) -static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) -{ -} - -static void fake_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb) -{ -} - -static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) -{ - return NULL; -} - -static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) -{ - return NULL; -} - -static unsigned int fake_mtu(const struct dst_entry *dst) -{ - return dst->dev->mtu; -} - -static struct dst_ops fake_dst_ops = { - .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), - .update_pmtu = fake_update_pmtu, - .redirect = fake_redirect, - .cow_metrics = fake_cow_metrics, - .neigh_lookup = fake_neigh_lookup, - .mtu = fake_mtu, -}; - -/* - * Initialize bogus route table used to keep netfilter happy. - * Currently, we fill in the PMTU entry because netfilter - * refragmentation needs it, and the rt_flags entry because - * ipt_REJECT needs it. Future netfilter modules might - * require us to fill additional fields. - */ -static const u32 br_dst_default_metrics[RTAX_MAX] = { - [RTAX_MTU - 1] = 1500, -}; - -void br_netfilter_rtable_init(struct net_bridge *br) -{ - struct rtable *rt = &br->fake_rtable; - - atomic_set(&rt->dst.__refcnt, 1); - rt->dst.dev = br->dev; - rt->dst.path = &rt->dst; - dst_init_metrics(&rt->dst, br_dst_default_metrics, true); - rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; - rt->dst.ops = &fake_dst_ops; -} - static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) { struct net_bridge_port *port; @@ -1031,38 +971,42 @@ static struct ctl_table brnf_table[] = { }; #endif -int __init br_netfilter_init(void) +static int __init br_netfilter_init(void) { int ret; - ret = dst_entries_init(&fake_dst_ops); + ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); if (ret < 0) return ret; - ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - if (ret < 0) { - dst_entries_destroy(&fake_dst_ops); - return ret; - } #ifdef CONFIG_SYSCTL brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); if (brnf_sysctl_header == NULL) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - dst_entries_destroy(&fake_dst_ops); - return -ENOMEM; + ret = -ENOMEM; + goto err1; } #endif printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; +err1: + nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + return ret; } -void br_netfilter_fini(void) +static void __exit br_netfilter_fini(void) { nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(brnf_sysctl_header); #endif - dst_entries_destroy(&fake_dst_ops); } + +module_init(br_netfilter_init); +module_exit(br_netfilter_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Lennert Buytenhek "); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge"); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 90a91e137acc..0fa66b83685f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -602,7 +602,7 @@ out_af: return err; } -void __exit br_netlink_fini(void) +void br_netlink_fini(void) { br_mdb_uninit(); rtnl_af_unregister(&br_af_ops); diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c new file mode 100644 index 000000000000..387cb3bd017c --- /dev/null +++ b/net/bridge/br_nf_core.c @@ -0,0 +1,96 @@ +/* + * Handle firewalling core + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * Bart De Schuymer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. + */ + +#include +#include +#include +#include +#include + +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include +#endif + +static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ +} + +static void fake_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ +} + +static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + return NULL; +} + +static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + return NULL; +} + +static unsigned int fake_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops fake_dst_ops = { + .family = AF_INET, + .protocol = cpu_to_be16(ETH_P_IP), + .update_pmtu = fake_update_pmtu, + .redirect = fake_redirect, + .cow_metrics = fake_cow_metrics, + .neigh_lookup = fake_neigh_lookup, + .mtu = fake_mtu, +}; + +/* + * Initialize bogus route table used to keep netfilter happy. + * Currently, we fill in the PMTU entry because netfilter + * refragmentation needs it, and the rt_flags entry because + * ipt_REJECT needs it. Future netfilter modules might + * require us to fill additional fields. + */ +static const u32 br_dst_default_metrics[RTAX_MAX] = { + [RTAX_MTU - 1] = 1500, +}; + +void br_netfilter_rtable_init(struct net_bridge *br) +{ + struct rtable *rt = &br->fake_rtable; + + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.dev = br->dev; + rt->dst.path = &rt->dst; + dst_init_metrics(&rt->dst, br_dst_default_metrics, true); + rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; + rt->dst.ops = &fake_dst_ops; +} + +int __init br_nf_core_init(void) +{ + return dst_entries_init(&fake_dst_ops); +} + +void br_nf_core_fini(void) +{ + dst_entries_destroy(&fake_dst_ops); +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 62a7fa2e3569..d304d752c091 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -221,7 +221,7 @@ struct net_bridge struct pcpu_sw_netstats __percpu *stats; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct rtable fake_rtable; bool nf_call_iptables; bool nf_call_ip6tables; @@ -751,13 +751,13 @@ static inline int br_vlan_enabled(struct net_bridge *br) #endif /* br_netfilter.c */ -#ifdef CONFIG_BRIDGE_NETFILTER -int br_netfilter_init(void); -void br_netfilter_fini(void); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) +int br_nf_core_init(void); +void br_nf_core_fini(void); void br_netfilter_rtable_init(struct net_bridge *); #else -#define br_netfilter_init() (0) -#define br_netfilter_fini() do { } while (0) +static inline int br_nf_core_init(void) { return 0; } +static inline void br_nf_core_fini(void) {} #define br_netfilter_rtable_init(x) #endif diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index c9e2572b15f4..cb431c6016ee 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -629,7 +629,7 @@ static ssize_t multicast_startup_query_interval_store( } static DEVICE_ATTR_RW(multicast_startup_query_interval); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { @@ -763,7 +763,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, &dev_attr_nf_call_ip6tables.attr, &dev_attr_nf_call_arptables.attr, -- cgit v1.2.3 From 6ea754eb761d9e7a8ac6fa462b05f9e4cf04fb6c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 22 Sep 2014 11:10:50 -0700 Subject: net: Change netdev_ logging functions to return void No caller or macro uses the return value so make all the functions return void. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/linux/netdevice.h | 19 +++++++++---------- net/core/dev.c | 44 +++++++++++++++++--------------------------- 2 files changed, 26 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9f5d293a0281..9b7fbacb6296 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3612,22 +3612,22 @@ static inline const char *netdev_reg_state(const struct net_device *dev) } __printf(3, 4) -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...); +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...); __printf(2, 3) -int netdev_emerg(const struct net_device *dev, const char *format, ...); +void netdev_emerg(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_alert(const struct net_device *dev, const char *format, ...); +void netdev_alert(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_crit(const struct net_device *dev, const char *format, ...); +void netdev_crit(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_err(const struct net_device *dev, const char *format, ...); +void netdev_err(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_warn(const struct net_device *dev, const char *format, ...); +void netdev_warn(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_notice(const struct net_device *dev, const char *format, ...); +void netdev_notice(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_info(const struct net_device *dev, const char *format, ...); +void netdev_info(const struct net_device *dev, const char *format, ...); #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) @@ -3645,7 +3645,6 @@ do { \ ({ \ if (0) \ netdev_printk(KERN_DEBUG, __dev, format, ##args); \ - 0; \ }) #endif diff --git a/net/core/dev.c b/net/core/dev.c index e2ced01c01e4..e55c546717d4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7066,53 +7066,45 @@ const char *netdev_drivername(const struct net_device *dev) return empty; } -static int __netdev_printk(const char *level, const struct net_device *dev, - struct va_format *vaf) +static void __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) { - int r; - if (dev && dev->dev.parent) { - r = dev_printk_emit(level[1] - '0', - dev->dev.parent, - "%s %s %s%s: %pV", - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), netdev_reg_state(dev), - vaf); + dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s%s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s%s: %pV", level, netdev_name(dev), - netdev_reg_state(dev), vaf); + printk("%s%s%s: %pV", + level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { - r = printk("%s(NULL net_device): %pV", level, vaf); + printk("%s(NULL net_device): %pV", level, vaf); } - - return r; } -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...) +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = __netdev_printk(level, dev, &vaf); + __netdev_printk(level, dev, &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ -int func(const struct net_device *dev, const char *fmt, ...) \ +void func(const struct net_device *dev, const char *fmt, ...) \ { \ - int r; \ struct va_format vaf; \ va_list args; \ \ @@ -7121,11 +7113,9 @@ int func(const struct net_device *dev, const char *fmt, ...) \ vaf.fmt = fmt; \ vaf.va = &args; \ \ - r = __netdev_printk(level, dev, &vaf); \ + __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ - \ - return r; \ } \ EXPORT_SYMBOL(func); -- cgit v1.2.3 From f4a775d14489a801a5b8b0540e23ab82e2703091 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Sep 2014 16:29:32 -0700 Subject: net: introduce __skb_header_release() While profiling TCP stack, I noticed one useless atomic operation in tcp_sendmsg(), caused by skb_header_release(). It turns out all current skb_header_release() users have a fresh skb, that no other user can see, so we can avoid one atomic operation. Introduce __skb_header_release() to clearly document this. This gave me a 1.5 % improvement on TCP_RR workload. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 +++++++++++++++ net/core/skbuff.c | 4 ++-- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_output.c | 10 +++++----- 4 files changed, 24 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f1bfa3781c75..8eaa62400fca 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1083,6 +1083,7 @@ static inline int skb_header_cloned(const struct sk_buff *skb) * Drop a reference to the header part of the buffer. This is done * by acquiring a payload reference. You must not read from the header * part of skb->data after this. + * Note : Check if you can use __skb_header_release() instead. */ static inline void skb_header_release(struct sk_buff *skb) { @@ -1091,6 +1092,20 @@ static inline void skb_header_release(struct sk_buff *skb) atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref); } +/** + * __skb_header_release - release reference to header + * @skb: buffer to operate on + * + * Variant of skb_header_release() assuming skb is private to caller. + * We can avoid one atomic operation. + */ +static inline void __skb_header_release(struct sk_buff *skb) +{ + skb->nohdr = 1; + atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT)); +} + + /** * skb_shared - is the buffer shared * @skb: buffer to check diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 06a8feb10099..512dc7dcbc32 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3179,7 +3179,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) skb_shinfo(nskb)->frag_list = p; skb_shinfo(nskb)->gso_size = pinfo->gso_size; pinfo->gso_size = 0; - skb_header_release(p); + __skb_header_release(p); NAPI_GRO_CB(nskb)->last = p; nskb->data_len += p->len; @@ -3211,7 +3211,7 @@ merge: else NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; - skb_header_release(skb); + __skb_header_release(skb); lp = p; done: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 070aeff1b131..553b01f52f71 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -609,7 +609,7 @@ static inline bool forced_push(const struct tcp_sock *tp) return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } -static inline void skb_entail(struct sock *sk, struct sk_buff *skb) +static void skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -618,7 +618,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; tcb->sacked = 0; - skb_header_release(skb); + __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8c61a7c0c889..f173b1c4f815 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -995,7 +995,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. */ tp->write_seq = TCP_SKB_CB(skb)->end_seq; - skb_header_release(skb); + __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); @@ -1167,7 +1167,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, } /* Link BUFF into the send queue. */ - skb_header_release(buff); + __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); return 0; @@ -1671,7 +1671,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, tcp_set_skb_tso_segs(sk, buff, mss_now); /* Link BUFF into the send queue. */ - skb_header_release(buff); + __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); return 0; @@ -2772,7 +2772,7 @@ int tcp_send_synack(struct sock *sk) if (nskb == NULL) return -ENOMEM; tcp_unlink_write_queue(skb, sk); - skb_header_release(nskb); + __skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); sk_wmem_free_skb(sk, skb); sk->sk_wmem_queued += nskb->truesize; @@ -2947,7 +2947,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->end_seq += skb->len; - skb_header_release(skb); + __skb_header_release(skb); __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); -- cgit v1.2.3 From 58e3cac5613aa01720e55637d8f1c966a25c3202 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Tue, 23 Sep 2014 10:54:37 +0200 Subject: net: optimise inet_proto_csum_replace4() csum_partial() is a generic function which is not optimised for small fixed length calculations, and its use requires to store "from" and "to" values in memory while we already have them available in registers. This also has impact, especially on RISC processors. In the same spirit as the change done by Eric Dumazet on csum_replace2(), this patch rewrites inet_proto_csum_replace4() taking into account RFC1624. I spotted during a NATted tcp transfert that csum_partial() is one of top 5 consuming functions (around 8%), and the second user of csum_partial() is inet_proto_csum_replace4(). Signed-off-by: Christophe Leroy Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/utils.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index eed34338736c..efc76dd9dcd1 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -306,16 +306,14 @@ EXPORT_SYMBOL(in6_pton); void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, int pseudohdr) { - __be32 diff[] = { ~from, to }; if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_partial(diff, sizeof(diff), - ~csum_unfold(*sum))); + *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from), + to)); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial(diff, sizeof(diff), - ~skb->csum); + skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to); } else if (pseudohdr) - *sum = ~csum_fold(csum_partial(diff, sizeof(diff), - csum_unfold(*sum))); + *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from), + to)); } EXPORT_SYMBOL(inet_proto_csum_replace4); -- cgit v1.2.3 From 8280bf00fdd87de876c4add5fa8ed05c18c78a2f Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Tue, 23 Sep 2014 20:02:15 +0800 Subject: net/openvswitch: remove dup comment in vport.h Remove the duplicated comment "/* The following definitions are for users of the vport subsytem: */" in vport.h Signed-off-by: Wang Sheng-Hui Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/vport.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 0d95b9f5f9c4..0efd62fef1e8 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -35,7 +35,6 @@ struct vport_parms; /* The following definitions are for users of the vport subsytem: */ -/* The following definitions are for users of the vport subsytem: */ struct vport_net { struct vport __rcu *gre_vport; }; -- cgit v1.2.3 From ff04a771ad25fc9ba91690e73465b4d34b6bf8b3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Sep 2014 18:39:30 -0700 Subject: net : optimize skb_release_data() Cache skb_shinfo(skb) in a variable to avoid computing it multiple times. Reorganize the tests to remove one indentation level. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 512dc7dcbc32..d4fdc649112c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -491,32 +491,33 @@ static void skb_free_head(struct sk_buff *skb) static void skb_release_data(struct sk_buff *skb) { - if (!skb->cloned || - !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, - &skb_shinfo(skb)->dataref)) { - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - skb_frag_unref(skb, i); - } + struct skb_shared_info *shinfo = skb_shinfo(skb); + int i; - /* - * If skb buf is from userspace, we need to notify the caller - * the lower device DMA has done; - */ - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; + if (skb->cloned && + atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, + &shinfo->dataref)) + return; - uarg = skb_shinfo(skb)->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, true); - } + for (i = 0; i < shinfo->nr_frags; i++) + __skb_frag_unref(&shinfo->frags[i]); - if (skb_has_frag_list(skb)) - skb_drop_fraglist(skb); + /* + * If skb buf is from userspace, we need to notify the caller + * the lower device DMA has done; + */ + if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) { + struct ubuf_info *uarg; - skb_free_head(skb); + uarg = shinfo->destructor_arg; + if (uarg->callback) + uarg->callback(uarg, true); } + + if (shinfo->frag_list) + kfree_skb_list(shinfo->frag_list); + + skb_free_head(skb); } /* -- cgit v1.2.3 From 24a2d43d8886f5a29c3cf108927f630c545a9a38 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Sep 2014 09:50:55 -0700 Subject: ipv4: rename ip_options_echo to __ip_options_echo() ip_options_echo() assumes struct ip_options is provided in &IPCB(skb)->opt Lets break this assumption, but provide a helper to not change all call points. ip_send_unicast_reply() gets a new struct ip_options pointer. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 15 ++++++++++++--- net/ipv4/ip_options.c | 6 ++---- net/ipv4/ip_output.c | 8 +++++--- net/ipv4/tcp_ipv4.c | 10 ++++++---- 4 files changed, 25 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index fcd9068fb8c3..0bb620702929 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -180,8 +180,10 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, - __be32 saddr, const struct ip_reply_arg *arg, +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, + const struct ip_options *sopt, + __be32 daddr, __be32 saddr, + const struct ip_reply_arg *arg, unsigned int len); #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) @@ -511,7 +513,14 @@ int ip_forward(struct sk_buff *skb); void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt, int is_frag); -int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb); + +int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, + const struct ip_options *sopt); +static inline int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) +{ + return __ip_options_echo(dopt, skb, &IPCB(skb)->opt); +} + void ip_options_fragment(struct sk_buff *skb); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ad382499bace..5b3d91be2db0 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -87,17 +87,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, * NOTE: dopt cannot point to skb. */ -int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) +int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, + const struct ip_options *sopt) { - const struct ip_options *sopt; unsigned char *sptr, *dptr; int soffset, doffset; int optlen; memset(dopt, 0, sizeof(struct ip_options)); - sopt = &(IPCB(skb)->opt); - if (sopt->optlen == 0) return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 215af2b155cb..c8fa62476461 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1522,8 +1522,10 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { .uc_ttl = -1, }; -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, - __be32 saddr, const struct ip_reply_arg *arg, +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, + const struct ip_options *sopt, + __be32 daddr, __be32 saddr, + const struct ip_reply_arg *arg, unsigned int len) { struct ip_options_data replyopts; @@ -1534,7 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, struct sock *sk; struct inet_sock *inet; - if (ip_options_echo(&replyopts.opt.opt, skb)) + if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) return; ipc.addr = daddr; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3b2e49cb2b61..28ab90382c01 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -681,8 +681,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; - ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); @@ -764,8 +765,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } -- cgit v1.2.3 From a224772db8420ecb7ce91a9ba5d535ee3a50d982 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Sep 2014 09:50:56 -0700 Subject: ipv6: add a struct inet6_skb_parm param to ipv6_opt_accepted() ipv6_opt_accepted() assumes IP6CB(skb) holds the struct inet6_skb_parm that it needs. Lets not assume this, as TCP stack might use a different place. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 3 ++- net/dccp/ipv6.c | 2 +- net/ipv6/af_inet6.c | 4 ++-- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 5 +++-- 5 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7e247e9b8765..97f472012438 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -288,7 +288,8 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); -bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb); +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, + const struct inet6_skb_parm *opt); static inline bool ipv6_accept_ra(struct inet6_dev *idev) { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 04cb17d4b0ce..ad2acfe1ca61 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -404,7 +404,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e4865a3ebe1d..34f726f59814 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -672,10 +672,10 @@ int inet6_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); -bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb) +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, + const struct inet6_skb_parm *opt) { const struct ipv6_pinfo *np = inet6_sk(sk); - const struct inet6_skb_parm *opt = IP6CB(skb); if (np->rxopt.all) { if ((opt->hop && (np->rxopt.bits.hopopts || diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index c643dc907ce7..9a2838e93cc5 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -203,7 +203,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->ir_num = ntohs(th->dest); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index de51a88bec6f..9400b4326f22 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -742,7 +742,8 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, ireq->ir_iif = inet6_iif(skb); if (!TCP_SKB_CB(skb)->tcp_tw_isn && - (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || + (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { atomic_inc(&skb->users); @@ -1367,7 +1368,7 @@ ipv6_pktoptions: np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (np->repflow) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); - if (ipv6_opt_accepted(sk, opt_skb)) { + if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); } else { -- cgit v1.2.3 From 971f10eca186cab238c49daa91f703c5a001b0b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Sep 2014 09:50:57 -0700 Subject: tcp: better TCP_SKB_CB layout to reduce cache line misses TCP maintains lists of skb in write queue, and in receive queues (in order and out of order queues) Scanning these lists both in input and output path usually requires access to skb->next, TCP_SKB_CB(skb)->seq, and TCP_SKB_CB(skb)->end_seq These fields are currently in two different cache lines, meaning we waste lot of memory bandwidth when these queues are big and flows have either packet drops or packet reorders. We can move TCP_SKB_CB(skb)->header at the end of TCP_SKB_CB, because this header is not used in fast path. This allows TCP to search much faster in the skb lists. Even with regular flows, we save one cache line miss in fast path. Thanks to Christoph Paasch for noticing we need to cleanup skb->cb[] (IPCB/IP6CB) before entering IP stack in tx path, and that I forgot IPCB use in tcp_v4_hnd_req() and tcp_v4_save_options(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 12 ++++++------ net/ipv4/tcp_ipv4.c | 19 ++++++++++++------- net/ipv4/tcp_output.c | 5 +++++ net/ipv6/tcp_ipv6.c | 7 +++++++ 4 files changed, 30 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index a4201ef216e8..4dc6641ee990 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -696,12 +696,6 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. */ struct tcp_skb_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ @@ -720,6 +714,12 @@ struct tcp_skb_cb { __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ /* 1 byte hole */ __u32 ack_seq; /* Sequence number ACK'd */ + union { + struct inet_skb_parm h4; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_skb_parm h6; +#endif + } header; /* For incoming frames */ }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 28ab90382c01..9ce3eac02957 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -886,18 +886,16 @@ EXPORT_SYMBOL(tcp_syn_flood_action); */ static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) { - const struct ip_options *opt = &(IPCB(skb)->opt); + const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt && opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); - if (dopt) { - if (ip_options_echo(&dopt->opt, skb)) { - kfree(dopt); - dopt = NULL; - } + if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { + kfree(dopt); + dopt = NULL; } } return dopt; @@ -1431,7 +1429,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_SYN_COOKIES if (!th->syn) - sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); + sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt); #endif return sk; } @@ -1636,6 +1634,13 @@ int tcp_v4_rcv(struct sk_buff *skb) th = tcp_hdr(skb); iph = ip_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), + sizeof(struct inet_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f173b1c4f815..a462fb1db896 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -974,6 +974,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; + + /* Cleanup our debris for IP stacks */ + memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9400b4326f22..132bac137aed 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1412,6 +1412,13 @@ static int tcp_v6_rcv(struct sk_buff *skb) th = tcp_hdr(skb); hdr = ipv6_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), + sizeof(struct inet6_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); -- cgit v1.2.3 From cd7d8498c9a5d510c64db38d9f4f4fbc41790f09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Sep 2014 04:11:22 -0700 Subject: tcp: change tcp_skb_pcount() location Our goal is to access no more than one cache line access per skb in a write or receive queue when doing the various walks. After recent TCP_SKB_CB() reorganizations, it is almost done. Last part is tcp_skb_pcount() which currently uses skb_shinfo(skb)->gso_segs, which is a terrible choice, because it needs 3 cache lines in current kernel (skb->head, skb->end, and shinfo->gso_segs are all in 3 different cache lines, far from skb->cb) This very simple patch reuses space currently taken by tcp_tw_isn only in input path, as tcp_skb_pcount is only needed for skb stored in write queue. This considerably speeds up tcp_ack(), granted we avoid shinfo->tx_flags to get SKBTX_ACK_TSTAMP, which seems possible. This also speeds up all sack processing in general. This speeds up tcp_sendmsg() because it no longer has to access/dirty shinfo. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 23 +++++++++++++++++++++-- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_output.c | 9 ++++++--- 4 files changed, 33 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 4dc6641ee990..02a9a2c366bf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -698,7 +698,16 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ + union { + /* Note : tcp_tw_isn is used in input path only + * (isn chosen by tcp_timewait_state_process()) + * + * tcp_gso_segs is used in write queue only, + * cf tcp_skb_pcount() + */ + __u32 tcp_tw_isn; + __u32 tcp_gso_segs; + }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. */ @@ -746,7 +755,17 @@ TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { - return skb_shinfo(skb)->gso_segs; + return TCP_SKB_CB(skb)->tcp_gso_segs; +} + +static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs) +{ + TCP_SKB_CB(skb)->tcp_gso_segs = segs; +} + +static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs) +{ + TCP_SKB_CB(skb)->tcp_gso_segs += segs; } /* This is valid iff tcp_skb_pcount() > 1. */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 553b01f52f71..87289e51be00 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -963,7 +963,7 @@ new_segment: skb->ip_summed = CHECKSUM_PARTIAL; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + tcp_skb_pcount_set(skb, 0); if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; @@ -1261,7 +1261,7 @@ new_segment: tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + tcp_skb_pcount_set(skb, 0); from += copy; copied += copy; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f3f016a15c5a..2c0af90231cf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1295,9 +1295,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; - skb_shinfo(prev)->gso_segs += pcount; - BUG_ON(skb_shinfo(skb)->gso_segs < pcount); - skb_shinfo(skb)->gso_segs -= pcount; + tcp_skb_pcount_add(prev, pcount); + BUG_ON(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK @@ -1310,7 +1310,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, } /* CHECKME: To clear or not to clear? Mimics normal skb currently */ - if (skb_shinfo(skb)->gso_segs <= 1) { + if (tcp_skb_pcount(skb) <= 1) { skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_type = 0; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a462fb1db896..4d92703df4c6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -384,7 +384,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; - shinfo->gso_segs = 1; + tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; shinfo->gso_type = 0; @@ -972,6 +972,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + /* OK, its time to fill skb_shinfo(skb)->gso_segs */ + skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); + /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; @@ -1019,11 +1022,11 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, /* Avoid the costly divide in the normal * non-TSO case. */ - shinfo->gso_segs = 1; + tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; shinfo->gso_type = 0; } else { - shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); + tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); shinfo->gso_size = mss_now; shinfo->gso_type = sk->sk_gso_type; } -- cgit v1.2.3 From 155c6e1ad4a778cad7f9fe6695afc91b3f5fe1ac Mon Sep 17 00:00:00 2001 From: "Peter Pan(潘卫平)" Date: Wed, 24 Sep 2014 22:17:02 +0800 Subject: tcp: use tcp_flags in tcp_data_queue() This patch is a cleanup which follows the idea in commit e11ecddf5128 (tcp: use TCP_SKB_CB(skb)->tcp_flags in input path), and it may reduce register pressure since skb->cb[] access is fast, bacause skb is probably in a register. v2: remove variable th v3: reword the changelog Signed-off-by: Weiping Pan Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2c0af90231cf..5073eefa6fae 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4344,7 +4344,6 @@ err: static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; bool fragstolen = false; @@ -4353,7 +4352,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) goto drop; skb_dst_drop(skb); - __skb_pull(skb, th->doff * 4); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); TCP_ECN_accept_cwr(tp, skb); @@ -4397,7 +4396,7 @@ queue_and_out: tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (skb->len) tcp_event_data_recv(sk, skb); - if (th->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); if (!skb_queue_empty(&tp->out_of_order_queue)) { -- cgit v1.2.3 From f7f1de51edbdd53b09061d12758cacd9901c363e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 24 Sep 2014 17:05:17 -0700 Subject: net: dsa: start and stop the PHY state machine dsa_slave_open() should start the PHY library state machine for its PHY interface, and dsa_slave_close() should stop the PHY library state machine accordingly. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 43c1e4ade689..4392e983abda 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -84,6 +84,9 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } + if (p->phy) + phy_start(p->phy); + return 0; clear_allmulti: @@ -101,6 +104,9 @@ static int dsa_slave_close(struct net_device *dev) struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = p->parent->dst->master_netdev; + if (p->phy) + phy_stop(p->phy); + dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); if (dev->flags & IFF_ALLMULTI) -- cgit v1.2.3 From b2f2af21e37f6d12bd735c27da8942331aa9b3d7 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 24 Sep 2014 17:05:18 -0700 Subject: net: dsa: allow enabling and disable switch ports Whenever a per-port network device is used/unused, invoke the switch driver port_enable/port_disable callbacks to allow saving as much power as possible by disabling unused parts of the switch (RX/TX logic, memory arrays, PHYs...). We supply a PHY device argument to make sure the switch driver can act on the PHY device if needed (like putting/taking the PHY out of deep low power mode). Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 8 ++++++++ net/dsa/slave.c | 14 ++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index d8054fb4a4df..4f664fe0e42c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -224,6 +224,14 @@ struct dsa_switch_driver { */ int (*suspend)(struct dsa_switch *ds); int (*resume)(struct dsa_switch *ds); + + /* + * Port enable/disable + */ + int (*port_enable)(struct dsa_switch *ds, int port, + struct phy_device *phy); + void (*port_disable)(struct dsa_switch *ds, int port, + struct phy_device *phy); }; void register_switch_driver(struct dsa_switch_driver *type); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4392e983abda..182d30ae6818 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -62,6 +62,7 @@ static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = p->parent->dst->master_netdev; + struct dsa_switch *ds = p->parent; int err; if (!(master->flags & IFF_UP)) @@ -84,11 +85,20 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } + if (ds->drv->port_enable) { + err = ds->drv->port_enable(ds, p->port, p->phy); + if (err) + goto clear_promisc; + } + if (p->phy) phy_start(p->phy); return 0; +clear_promisc: + if (dev->flags & IFF_PROMISC) + dev_set_promiscuity(master, 0); clear_allmulti: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(master, -1); @@ -103,6 +113,7 @@ static int dsa_slave_close(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = p->parent->dst->master_netdev; + struct dsa_switch *ds = p->parent; if (p->phy) phy_stop(p->phy); @@ -117,6 +128,9 @@ static int dsa_slave_close(struct net_device *dev) if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); + if (ds->drv->port_disable) + ds->drv->port_disable(ds, p->port, p->phy); + return 0; } -- cgit v1.2.3 From 7905288f093ad924004609bb89a7ce1597892726 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 24 Sep 2014 17:05:21 -0700 Subject: net: dsa: allow switches driver to implement get/set EEE Allow switches driver to query and enable/disable EEE on a per-port basis by implementing the ethtool_{get,set}_eee settings and delegating these operations to the switch driver. set_eee() will need to coordinate with the PHY driver to make sure that EEE is enabled, the link-partner supports it and the auto-negotiation result is satisfactory. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 9 +++++++++ net/dsa/slave.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4f664fe0e42c..58ad8c6492db 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -232,6 +232,15 @@ struct dsa_switch_driver { struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port, struct phy_device *phy); + + /* + * EEE setttings + */ + int (*set_eee)(struct dsa_switch *ds, int port, + struct phy_device *phydev, + struct ethtool_eee *e); + int (*get_eee)(struct dsa_switch *ds, int port, + struct ethtool_eee *e); }; void register_switch_driver(struct dsa_switch_driver *type); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 182d30ae6818..36953c84ff2d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -342,6 +342,44 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) return ret; } +static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret; + + if (!ds->drv->set_eee) + return -EOPNOTSUPP; + + ret = ds->drv->set_eee(ds, p->port, p->phy, e); + if (ret) + return ret; + + if (p->phy) + ret = phy_ethtool_set_eee(p->phy, e); + + return ret; +} + +static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret; + + if (!ds->drv->get_eee) + return -EOPNOTSUPP; + + ret = ds->drv->get_eee(ds, p->port, e); + if (ret) + return ret; + + if (p->phy) + ret = phy_ethtool_get_eee(p->phy, e); + + return ret; +} + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_settings = dsa_slave_get_settings, .set_settings = dsa_slave_set_settings, @@ -353,6 +391,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_sset_count = dsa_slave_get_sset_count, .set_wol = dsa_slave_set_wol, .get_wol = dsa_slave_get_wol, + .set_eee = dsa_slave_set_eee, + .get_eee = dsa_slave_get_eee, }; static const struct net_device_ops dsa_slave_netdev_ops = { -- cgit v1.2.3 From 18d0264f630e200772bf236ac5747c47e908501e Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 25 Sep 2014 10:26:37 -0700 Subject: net_sched: remove the first parameter from tcf_exts_destroy() Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 +- net/sched/cls_api.c | 2 +- net/sched/cls_basic.c | 4 ++-- net/sched/cls_bpf.c | 4 ++-- net/sched/cls_cgroup.c | 6 +++--- net/sched/cls_flow.c | 4 ++-- net/sched/cls_fw.c | 4 ++-- net/sched/cls_route.c | 4 ++-- net/sched/cls_rsvp.h | 4 ++-- net/sched/cls_tcindex.c | 4 ++-- net/sched/cls_u32.c | 4 ++-- 11 files changed, 21 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6da46dcf1049..73f9532ee581 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -137,7 +137,7 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr); -void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts); +void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e547efdaba9d..77147c8c4acc 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -496,7 +496,7 @@ out: return skb->len; } -void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) +void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 1937298d6775..fe20826a79e7 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -94,7 +94,7 @@ static void basic_delete_filter(struct rcu_head *head) struct tcf_proto *tp = f->tp; tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); + tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(tp, &f->ematches); kfree(f); } @@ -161,7 +161,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, return 0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 4e3f5bfc0b26..4318d067b0a0 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -93,7 +93,7 @@ static int cls_bpf_init(struct tcf_proto *tp) static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) { tcf_unbind_filter(tp, &prog->res); - tcf_exts_destroy(tp, &prog->exts); + tcf_exts_destroy(&prog->exts); bpf_prog_destroy(prog->filter); @@ -217,7 +217,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, errout_free: kfree(bpf_ops); errout: - tcf_exts_destroy(tp, &exts); + tcf_exts_destroy(&exts); return ret; } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 15c34d4ccd9e..3409f168225f 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -86,7 +86,7 @@ static void cls_cgroup_destroy_rcu(struct rcu_head *root) struct cls_cgroup_head, rcu); - tcf_exts_destroy(head->tp, &head->exts); + tcf_exts_destroy(&head->exts); tcf_em_tree_destroy(head->tp, &head->ematches); kfree(head); } @@ -135,7 +135,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t); if (err < 0) { - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); goto errout; } @@ -156,7 +156,7 @@ static void cls_cgroup_destroy(struct tcf_proto *tp) struct cls_cgroup_head *head = rtnl_dereference(tp->root); if (head) { - tcf_exts_destroy(tp, &head->exts); + tcf_exts_destroy(&head->exts); tcf_em_tree_destroy(tp, &head->ematches); RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 95736fa479f3..f18d27f7b5f2 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -354,7 +354,7 @@ static void flow_destroy_filter(struct rcu_head *head) struct flow_filter *f = container_of(head, struct flow_filter, rcu); del_timer_sync(&f->perturb_timer); - tcf_exts_destroy(f->tp, &f->exts); + tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(f->tp, &f->ematches); kfree(f); } @@ -533,7 +533,7 @@ err2: tcf_em_tree_destroy(tp, &t); kfree(fnew); err1: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 2650285620ee..da805aeeb65c 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -126,7 +126,7 @@ static void fw_delete_filter(struct rcu_head *head) struct tcf_proto *tp = f->tp; tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); + tcf_exts_destroy(&f->exts); kfree(f); } @@ -223,7 +223,7 @@ fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, return 0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index ba96deacf27c..b665aee661f7 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -272,7 +272,7 @@ route4_delete_filter(struct rcu_head *head) struct tcf_proto *tp = f->tp; tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); + tcf_exts_destroy(&f->exts); kfree(f); } @@ -456,7 +456,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, return 0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index b044c208b133..1c64a09753c4 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -264,7 +264,7 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) { tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); + tcf_exts_destroy(&f->exts); kfree_rcu(f, rcu); } @@ -577,7 +577,7 @@ insert: errout: kfree(f); errout2: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 5054fae33a48..e3c6fa3ea3d2 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -169,7 +169,7 @@ found: rcu_assign_pointer(*walk, rtnl_dereference(f->next)); } tcf_unbind_filter(tp, &r->res); - tcf_exts_destroy(tp, &r->exts); + tcf_exts_destroy(&r->exts); if (f) kfree_rcu(f, rcu); return 0; @@ -401,7 +401,7 @@ errout_alloc: kfree(cp->h); errout: kfree(cp); - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ef97a646ee94..4be3ebf46d67 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -359,7 +359,7 @@ static int u32_destroy_key(struct tcf_proto *tp, bool free_pf) { tcf_unbind_filter(tp, &n->res); - tcf_exts_destroy(tp, &n->exts); + tcf_exts_destroy(&n->exts); if (n->ht_down) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF @@ -606,7 +606,7 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, return 0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } -- cgit v1.2.3 From 825bae5d97d351ddf9720400df133fc4b1bc6a13 Mon Sep 17 00:00:00 2001 From: Rick Jones Date: Thu, 25 Sep 2014 10:55:28 -0700 Subject: arp: Do not perturb drop profiles with ignored ARP packets We do not wish to disturb dropwatch or perf drop profiles with an ARP we will ignore. Signed-off-by: Rick Jones Signed-off-by: David S. Miller --- net/ipv4/arp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1a9b99e04465..16acb59d665e 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -953,10 +953,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, { const struct arphdr *arp; + /* do not tweak dropwatch on an ARP we will ignore */ if (dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) - goto freeskb; + goto consumeskb; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) @@ -974,6 +975,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); +consumeskb: + consume_skb(skb); + return 0; freeskb: kfree_skb(skb); out_of_mem: -- cgit v1.2.3 From 02c5e84413dae4aa650536097d4195a356217d3d Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 25 Sep 2014 12:06:04 -0700 Subject: net_sched: fix errno in tcindex_set_parms() When kmemdup() fails, we should return -ENOMEM. Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index e3c6fa3ea3d2..365b23b928f4 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -237,15 +237,14 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (err < 0) return err; + err = -ENOMEM; /* tcindex_data attributes must look atomic to classifier/lookup so * allocate new tcindex data and RCU assign it onto root. Keeping * perfect hash and hash pointers from old data. */ cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) { - err = -ENOMEM; + if (!cp) goto errout; - } cp->mask = p->mask; cp->shift = p->shift; -- cgit v1.2.3 From 68f6a7c6c9817f2e6a66b59893de3c901ae5608c Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 25 Sep 2014 12:06:05 -0700 Subject: net_sched: fix another regression in cls_tcindex Clearly the following change is not expected: - if (!cp.perfect && !cp.h) - cp.alloc_hash = cp.hash; + if (!cp->perfect && cp->h) + cp->alloc_hash = cp->hash; Fixes: commit 331b72922c5f58d48fd ("net: sched: RCU cls_tcindex") Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 365b23b928f4..8d0e83d6903e 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -303,7 +303,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, cp->hash = DEFAULT_HASH_SIZE; } - if (!cp->perfect && cp->h) + if (!cp->perfect && !cp->h) cp->alloc_hash = cp->hash; /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) -- cgit v1.2.3 From 53dfd501819a6e9c3a7d56cac1ddaf03fe90800d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 26 Sep 2014 10:02:50 -0700 Subject: net: sched: cls_rcvp, complete rcu conversion This completes the cls_rsvp conversion to RCU safe copy, update semantics. As a result all cases of tcf_exts_change occur on empty lists now. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/cls_rsvp.h | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 1c64a09753c4..6bb55f277a5a 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -222,6 +222,33 @@ matched: return -1; } +static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h) +{ + struct rsvp_head *head = rtnl_dereference(tp->root); + struct rsvp_session *s; + struct rsvp_filter __rcu **ins; + struct rsvp_filter *pins; + unsigned int h1 = h & 0xFF; + unsigned int h2 = (h >> 8) & 0xFF; + + for (s = rtnl_dereference(head->ht[h1]); s; + s = rtnl_dereference(s->next)) { + for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ; + ins = &pins->next, pins = rtnl_dereference(*ins)) { + if (pins->handle == h) { + RCU_INIT_POINTER(n->next, pins->next); + rcu_assign_pointer(*ins, n); + return; + } + } + } + + /* Something went wrong if we are trying to replace a non-existant + * node. Mind as well halt instead of silently failing. + */ + BUG_ON(1); +} + static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) { struct rsvp_head *head = rtnl_dereference(tp->root); @@ -454,15 +481,26 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, f = (struct rsvp_filter *)*arg; if (f) { /* Node exists: adjust only classid */ + struct rsvp_filter *n; if (f->handle != handle && handle) goto errout2; + + n = kmemdup(f, sizeof(*f), GFP_KERNEL); + if (!n) { + err = -ENOMEM; + goto errout2; + } + + tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + if (tb[TCA_RSVP_CLASSID]) { - f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); - tcf_bind_filter(tp, &f->res, base); + n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); + tcf_bind_filter(tp, &n->res, base); } - tcf_exts_change(tp, &f->exts, &e); + tcf_exts_change(tp, &n->exts, &e); + rsvp_replace(tp, n, handle); return 0; } -- cgit v1.2.3 From 55d8694fa82c9b5858ae5a78a210353961f908f9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 26 Sep 2014 22:37:32 +0200 Subject: net: tcp: assign tcp cong_ops when tcp sk is created Split assignment and initialization from one into two functions. This is required by followup patches that add Datacenter TCP (DCTCP) congestion control algorithm - we need to be able to determine if the connection is moderated by DCTCP before the 3WHS has finished. As we walk the available congestion control list during the assignment, we are always guaranteed to have Reno present as it's fixed compiled-in. Therefore, since we're doing the early assignment, we don't have a real use for the Reno alias tcp_init_congestion_ops anymore and can thus remove it. Actual usage of the congestion control operations are being made after the 3WHS has finished, in some cases however we can access get_info() via diag if implemented, therefore we need to zero out the private area for those modules. Joint work with Daniel Borkmann and Glenn Judd. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 6 ++---- net/ipv4/tcp_cong.c | 46 ++++++++++++++++++++++------------------------ net/ipv4/tcp_minisocks.c | 5 ++--- 4 files changed, 27 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 02a9a2c366bf..f99b0c072ee5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -824,6 +824,7 @@ struct tcp_congestion_ops { int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); +void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); int tcp_set_default_congestion_control(const char *name); @@ -835,7 +836,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name); int tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); -extern struct tcp_congestion_ops tcp_init_congestion_ops; u32 tcp_reno_ssthresh(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 87289e51be00..cf5e508e1ef5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -405,7 +405,7 @@ void tcp_init_sock(struct sock *sk) tp->reordering = sysctl_tcp_reordering; tcp_enable_early_retrans(tp); - icsk->icsk_ca_ops = &tcp_init_congestion_ops; + tcp_assign_congestion_control(sk); tp->tsoffset = 0; @@ -3258,8 +3258,6 @@ void __init tcp_init(void) tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_metrics_init(); - - tcp_register_congestion_control(&tcp_reno); - + BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 80248f56c89f..a6c8a5775624 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Assign choice of congestion control. */ -void tcp_init_congestion_control(struct sock *sk) +void tcp_assign_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; - /* if no choice made yet assign the current value set as default */ - if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { - rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (try_module_get(ca->owner)) { - icsk->icsk_ca_ops = ca; - break; - } - - /* fallback to next available */ + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (likely(try_module_get(ca->owner))) { + icsk->icsk_ca_ops = ca; + goto out; } - rcu_read_unlock(); + /* Fallback to next available. The last really + * guaranteed fallback is Reno from this list. + */ } +out: + rcu_read_unlock(); + + /* Clear out private data before diag gets it and + * the ca has not been initialized. + */ + if (ca->get_info) + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); +} + +void tcp_init_congestion_control(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); @@ -345,15 +355,3 @@ struct tcp_congestion_ops tcp_reno = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, }; - -/* Initial congestion control used (until SYN) - * really reno under another name so we can tell difference - * during tcp_set_default_congestion_control - */ -struct tcp_congestion_ops tcp_init_congestion_ops = { - .name = "", - .owner = THIS_MODULE, - .ssthresh = tcp_reno_ssthresh, - .cong_avoid = tcp_reno_cong_avoid, -}; -EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a058f411d3a6..47b73506b77e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; - if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && - !try_module_get(newicsk->icsk_ca_ops->owner)) - newicsk->icsk_ca_ops = &tcp_init_congestion_ops; + if (!try_module_get(newicsk->icsk_ca_ops->owner)) + tcp_assign_congestion_control(newsk); tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); -- cgit v1.2.3 From 30e502a34b8b21fae2c789da102bd9f6e99fef83 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Sep 2014 22:37:33 +0200 Subject: net: tcp: add flag for ca to indicate that ECN is required This patch adds a flag to TCP congestion algorithms that allows for requesting to mark IPv4/IPv6 sockets with transport as ECN capable, that is, ECT(0), when required by a congestion algorithm. It is currently used and needed in DataCenter TCP (DCTCP), as it requires both peers to assert ECT on all IP packets sent - it uses ECN feedback (i.e. CE, Congestion Encountered information) from switches inside the data center to derive feedback to the end hosts. Therefore, simply add a new flag to icsk_ca_ops. Note that DCTCP's algorithm/behaviour slightly diverges from RFC3168, therefore this is only (!) enabled iff the assigned congestion control ops module has requested this. By that, we can tightly couple this logic really only to the provided congestion control ops. Joint work with Florian Westphal and Glenn Judd. Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 61 +++++++++++++++++++++++++++++++++++++-------------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 25 +++++++++++++++------ 3 files changed, 63 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index f99b0c072ee5..a12f145cfbc3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -733,23 +733,6 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set - * - * If we receive a SYN packet with these bits set, it means a network is - * playing bad games with TOS bits. In order to avoid possible false congestion - * notifications, we disable TCP ECN negociation. - */ -static inline void -TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, - struct net *net) -{ - const struct tcphdr *th = tcp_hdr(skb); - - if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr && - INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield)) - inet_rsk(req)->ecn_ok = 1; -} - /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ @@ -791,7 +774,10 @@ enum tcp_ca_event { #define TCP_CA_MAX 128 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) +/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ #define TCP_CONG_NON_RESTRICTED 0x1 +/* Requires ECN/ECT set on all packets */ +#define TCP_CONG_NEEDS_ECN 0x2 struct tcp_congestion_ops { struct list_head list; @@ -840,6 +826,13 @@ u32 tcp_reno_ssthresh(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; +static inline bool tcp_ca_needs_ecn(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; +} + static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -857,6 +850,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } +/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set + * + * If we receive a SYN packet with these bits set, it means a + * network is playing bad games with TOS bits. In order to + * avoid possible false congestion notifications, we disable + * TCP ECN negociation. + * + * Exception: tcp_ca wants ECN. This is required for DCTCP + * congestion control; it requires setting ECT on all packets, + * including SYN. We inverse the test in this case: If our + * local socket wants ECN, but peer only set ece/cwr (but not + * ECT in IP header) its probably a non-DCTCP aware sender. + */ +static inline void +TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, + const struct sock *listen_sk) +{ + const struct tcphdr *th = tcp_hdr(skb); + const struct net *net = sock_net(listen_sk); + bool th_ecn = th->ece && th->cwr; + bool ect, need_ecn; + + if (!th_ecn) + return; + + ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); + need_ecn = tcp_ca_needs_ecn(listen_sk); + + if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + inet_rsk(req)->ecn_ok = 1; + else if (ect && need_ecn) + inet_rsk(req)->ecn_ok = 1; +} + /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5073eefa6fae..fb0fe97e1c54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5944,7 +5944,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sock_net(sk)); + TCP_ECN_create_request(req, skb, sk); if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4d92703df4c6..20e73271d75c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk) } /* Packet ECN state for a SYN-ACK */ -static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) +static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb) { + const struct tcp_sock *tp = tcp_sk(sk); + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; + else if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); } /* Packet ECN state for a SYN. */ @@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = 0; - if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || + tcp_ca_needs_ecn(sk)) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); } } static __inline__ void -TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) +TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th, + struct sock *sk) { - if (inet_rsk(req)->ecn_ok) + if (inet_rsk(req)->ecn_ok) { th->ece = 1; + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + } } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to @@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, tcp_hdr(skb)->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } - } else { + } else if (!tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } @@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk) } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; - TCP_ECN_send_synack(tcp_sk(sk), skb); + TCP_ECN_send_synack(sk, skb); } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - TCP_ECN_make_synack(req, th); + TCP_ECN_make_synack(req, th, sk); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is -- cgit v1.2.3 From 7354c8c389d18719dd71cc810da70b0921d66694 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 26 Sep 2014 22:37:34 +0200 Subject: net: tcp: split ack slow/fast events from cwnd_event The congestion control ops "cwnd_event" currently supports CA_EVENT_FAST_ACK and CA_EVENT_SLOW_ACK events (among others). Both FAST and SLOW_ACK are only used by Westwood congestion control algorithm. This removes both flags from cwnd_event and adds a new in_ack_event callback for this. The goal is to be able to provide more detailed information about ACKs, such as whether ECE flag was set, or whether the ACK resulted in a window update. It is required for DataCenter TCP (DCTCP) congestion control algorithm as it makes a different choice depending on ECE being set or not. Joint work with Daniel Borkmann and Glenn Judd. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 8 ++++++-- net/ipv4/tcp_input.c | 12 ++++++++++-- net/ipv4/tcp_westwood.c | 28 ++++++++++++++++------------ 3 files changed, 32 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index a12f145cfbc3..7ec6a28305c0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -763,8 +763,10 @@ enum tcp_ca_event { CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ - CA_EVENT_FAST_ACK, /* in sequence ack */ - CA_EVENT_SLOW_ACK, /* other ack */ +}; + +enum tcp_ca_ack_event_flags { + CA_ACK_SLOWPATH = (1 << 0), }; /* @@ -796,6 +798,8 @@ struct tcp_congestion_ops { void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); + /* call when ack arrives (optional) */ + void (*in_ack_event)(struct sock *sk, u32 flags); /* new value of cwnd after loss (optional) */ u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb0fe97e1c54..8a38774cc66e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3362,6 +3362,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) } } +static inline void tcp_in_ack_event(struct sock *sk, u32 flags) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->in_ack_event) + icsk->icsk_ca_ops->in_ack_event(sk, flags); +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3421,7 +3429,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; - tcp_ca_event(sk, CA_EVENT_FAST_ACK); + tcp_in_ack_event(sk, 0); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { @@ -3439,7 +3447,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; - tcp_ca_event(sk, CA_EVENT_SLOW_ACK); + tcp_in_ack_event(sk, CA_ACK_SLOWPATH); } /* We passed data and got it acked, remove any soft error diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 81911a92356c..bb63fba47d47 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -220,32 +220,35 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } +static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) +{ + if (ack_flags & CA_ACK_SLOWPATH) { + struct westwood *w = inet_csk_ca(sk); + + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); + + update_rtt_min(w); + return; + } + + westwood_fast_bw(sk); +} + static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); switch (event) { - case CA_EVENT_FAST_ACK: - westwood_fast_bw(sk); - break; - case CA_EVENT_COMPLETE_CWR: tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; - case CA_EVENT_LOSS: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); /* Update RTT_min when next ack arrives */ w->reset_rtt_min = 1; break; - - case CA_EVENT_SLOW_ACK: - westwood_update_window(sk); - w->bk += westwood_acked_count(sk); - update_rtt_min(w); - break; - default: /* don't care */ break; @@ -274,6 +277,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .cwnd_event = tcp_westwood_event, + .in_ack_event = tcp_westwood_ack, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, -- cgit v1.2.3 From 9890092e46b2996bb85f7f973e69424cb5c07bc0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 26 Sep 2014 22:37:35 +0200 Subject: net: tcp: more detailed ACK events and events for CE marked packets DataCenter TCP (DCTCP) determines cwnd growth based on ECN information and ACK properties, e.g. ACK that updates window is treated differently than DUPACK. Also DCTCP needs information whether ACK was delayed ACK. Furthermore, DCTCP also implements a CE state machine that keeps track of CE markings of incoming packets. Therefore, extend the congestion control framework to provide these event types, so that DCTCP can be properly implemented as a normal congestion algorithm module outside of the core stack. Joint work with Daniel Borkmann and Glenn Judd. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 9 ++++++++- net/ipv4/tcp_input.c | 22 ++++++++++++++++++---- net/ipv4/tcp_output.c | 4 ++++ 3 files changed, 30 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7ec6a28305c0..1f57c5363492 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -763,10 +763,17 @@ enum tcp_ca_event { CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ + CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */ + CA_EVENT_NON_DELAYED_ACK, }; +/* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { - CA_ACK_SLOWPATH = (1 << 0), + CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ + CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ + CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ }; /* diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8a38774cc66e..fc133178c787 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s tcp_enter_quickack_mode((struct sock *)tp); break; case INET_ECN_CE: + if (tcp_ca_needs_ecn((struct sock *)tp)) + tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode((struct sock *)tp); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } - /* fallinto */ + tp->ecn_flags |= TCP_ECN_SEEN; + break; default: + if (tcp_ca_needs_ecn((struct sock *)tp)) + tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; + break; } } @@ -3429,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; - tcp_in_ack_event(sk, 0); + tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { + u32 ack_ev_flags = CA_ACK_SLOWPATH; + if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else @@ -3444,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_rtt_us); - if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) + if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; + ack_ev_flags |= CA_ACK_ECE; + } + + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; - tcp_in_ack_event(sk, CA_ACK_SLOWPATH); + tcp_in_ack_event(sk, ack_ev_flags); } /* We passed data and got it acked, remove any soft error diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20e73271d75c..124f9e4e4594 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3130,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk) int ato = icsk->icsk_ack.ato; unsigned long timeout; + tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); + if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; @@ -3186,6 +3188,8 @@ void tcp_send_ack(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return; + tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); + /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. -- cgit v1.2.3 From e3118e8359bb7c59555aca60c725106e6d78c5ce Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Sep 2014 22:37:36 +0200 Subject: net: tcp: add DCTCP congestion control algorithm This work adds the DataCenter TCP (DCTCP) congestion control algorithm [1], which has been first published at SIGCOMM 2010 [2], resp. follow-up analysis at SIGMETRICS 2011 [3] (and also, more recently as an informational IETF draft available at [4]). DCTCP is an enhancement to the TCP congestion control algorithm for data center networks. Typical data center workloads are i.e. i) partition/aggregate (queries; bursty, delay sensitive), ii) short messages e.g. 50KB-1MB (for coordination and control state; delay sensitive), and iii) large flows e.g. 1MB-100MB (data update; throughput sensitive). DCTCP has therefore been designed for such environments to provide/achieve the following three requirements: * High burst tolerance (incast due to partition/aggregate) * Low latency (short flows, queries) * High throughput (continuous data updates, large file transfers) with commodity, shallow buffered switches The basic idea of its design consists of two fundamentals: i) on the switch side, packets are being marked when its internal queue length > threshold K (K is chosen so that a large enough headroom for marked traffic is still available in the switch queue); ii) the sender/host side maintains a moving average of the fraction of marked packets, so each RTT, F is being updated as follows: F := X / Y, where X is # of marked ACKs, Y is total # of ACKs alpha := (1 - g) * alpha + g * F, where g is a smoothing constant The resulting alpha (iow: probability that switch queue is congested) is then being used in order to adaptively decrease the congestion window W: W := (1 - (alpha / 2)) * W The means for receiving marked packets resp. marking them on switch side in DCTCP is the use of ECN. RFC3168 describes a mechanism for using Explicit Congestion Notification from the switch for early detection of congestion, rather than waiting for segment loss to occur. However, this method only detects the presence of congestion, not the *extent*. In the presence of mild congestion, it reduces the TCP congestion window too aggressively and unnecessarily affects the throughput of long flows [4]. DCTCP, as mentioned, enhances Explicit Congestion Notification (ECN) processing to estimate the fraction of bytes that encounter congestion, rather than simply detecting that some congestion has occurred. DCTCP then scales the TCP congestion window based on this estimate [4], thus it can derive multibit feedback from the information present in the single-bit sequence of marks in its control law. And thus act in *proportion* to the extent of congestion, not its *presence*. Switches therefore set the Congestion Experienced (CE) codepoint in packets when internal queue lengths exceed threshold K. Resulting, DCTCP delivers the same or better throughput than normal TCP, while using 90% less buffer space. It was found in [2] that DCTCP enables the applications to handle 10x the current background traffic, without impacting foreground traffic. Moreover, a 10x increase in foreground traffic did not cause any timeouts, and thus largely eliminates TCP incast collapse problems. The algorithm itself has already seen deployments in large production data centers since then. We did a long-term stress-test and analysis in a data center, short summary of our TCP incast tests with iperf compared to cubic: This test measured DCTCP throughput and latency and compared it with CUBIC throughput and latency for an incast scenario. In this test, 19 senders sent at maximum rate to a single receiver. The receiver simply ran iperf -s. The senders ran iperf -c -t 30. All senders started simultaneously (using local clocks synchronized by ntp). This test was repeated multiple times. Below shows the results from a single test. Other tests are similar. (DCTCP results were extremely consistent, CUBIC results show some variance induced by the TCP timeouts that CUBIC encountered.) For this test, we report statistics on the number of TCP timeouts, flow throughput, and traffic latency. 1) Timeouts (total over all flows, and per flow summaries): CUBIC DCTCP Total 3227 25 Mean 169.842 1.316 Median 183 1 Max 207 5 Min 123 0 Stddev 28.991 1.600 Timeout data is taken by measuring the net change in netstat -s "other TCP timeouts" reported. As a result, the timeout measurements above are not restricted to the test traffic, and we believe that it is likely that all of the "DCTCP timeouts" are actually timeouts for non-test traffic. We report them nevertheless. CUBIC will also include some non-test timeouts, but they are drawfed by bona fide test traffic timeouts for CUBIC. Clearly DCTCP does an excellent job of preventing TCP timeouts. DCTCP reduces timeouts by at least two orders of magnitude and may well have eliminated them in this scenario. 2) Throughput (per flow in Mbps): CUBIC DCTCP Mean 521.684 521.895 Median 464 523 Max 776 527 Min 403 519 Stddev 105.891 2.601 Fairness 0.962 0.999 Throughput data was simply the average throughput for each flow reported by iperf. By avoiding TCP timeouts, DCTCP is able to achieve much better per-flow results. In CUBIC, many flows experience TCP timeouts which makes flow throughput unpredictable and unfair. DCTCP, on the other hand, provides very clean predictable throughput without incurring TCP timeouts. Thus, the standard deviation of CUBIC throughput is dramatically higher than the standard deviation of DCTCP throughput. Mean throughput is nearly identical because even though cubic flows suffer TCP timeouts, other flows will step in and fill the unused bandwidth. Note that this test is something of a best case scenario for incast under CUBIC: it allows other flows to fill in for flows experiencing a timeout. Under situations where the receiver is issuing requests and then waiting for all flows to complete, flows cannot fill in for timed out flows and throughput will drop dramatically. 3) Latency (in ms): CUBIC DCTCP Mean 4.0088 0.04219 Median 4.055 0.0395 Max 4.2 0.085 Min 3.32 0.028 Stddev 0.1666 0.01064 Latency for each protocol was computed by running "ping -i 0.2 " from a single sender to the receiver during the incast test. For DCTCP, "ping -Q 0x6 -i 0.2 " was used to ensure that traffic traversed the DCTCP queue and was not dropped when the queue size was greater than the marking threshold. The summary statistics above are over all ping metrics measured between the single sender, receiver pair. The latency results for this test show a dramatic difference between CUBIC and DCTCP. CUBIC intentionally overflows the switch buffer which incurs the maximum queue latency (more buffer memory will lead to high latency.) DCTCP, on the other hand, deliberately attempts to keep queue occupancy low. The result is a two orders of magnitude reduction of latency with DCTCP - even with a switch with relatively little RAM. Switches with larger amounts of RAM will incur increasing amounts of latency for CUBIC, but not for DCTCP. 4) Convergence and stability test: This test measured the time that DCTCP took to fairly redistribute bandwidth when a new flow commences. It also measured DCTCP's ability to remain stable at a fair bandwidth distribution. DCTCP is compared with CUBIC for this test. At the commencement of this test, a single flow is sending at maximum rate (near 10 Gbps) to a single receiver. One second after that first flow commences, a new flow from a distinct server begins sending to the same receiver as the first flow. After the second flow has sent data for 10 seconds, the second flow is terminated. The first flow sends for an additional second. Ideally, the bandwidth would be evenly shared as soon as the second flow starts, and recover as soon as it stops. The results of this test are shown below. Note that the flow bandwidth for the two flows was measured near the same time, but not simultaneously. DCTCP performs nearly perfectly within the measurement limitations of this test: bandwidth is quickly distributed fairly between the two flows, remains stable throughout the duration of the test, and recovers quickly. CUBIC, in contrast, is slow to divide the bandwidth fairly, and has trouble remaining stable. CUBIC DCTCP Seconds Flow 1 Flow 2 Seconds Flow 1 Flow 2 0 9.93 0 0 9.92 0 0.5 9.87 0 0.5 9.86 0 1 8.73 2.25 1 6.46 4.88 1.5 7.29 2.8 1.5 4.9 4.99 2 6.96 3.1 2 4.92 4.94 2.5 6.67 3.34 2.5 4.93 5 3 6.39 3.57 3 4.92 4.99 3.5 6.24 3.75 3.5 4.94 4.74 4 6 3.94 4 5.34 4.71 4.5 5.88 4.09 4.5 4.99 4.97 5 5.27 4.98 5 4.83 5.01 5.5 4.93 5.04 5.5 4.89 4.99 6 4.9 4.99 6 4.92 5.04 6.5 4.93 5.1 6.5 4.91 4.97 7 4.28 5.8 7 4.97 4.97 7.5 4.62 4.91 7.5 4.99 4.82 8 5.05 4.45 8 5.16 4.76 8.5 5.93 4.09 8.5 4.94 4.98 9 5.73 4.2 9 4.92 5.02 9.5 5.62 4.32 9.5 4.87 5.03 10 6.12 3.2 10 4.91 5.01 10.5 6.91 3.11 10.5 4.87 5.04 11 8.48 0 11 8.49 4.94 11.5 9.87 0 11.5 9.9 0 SYN/ACK ECT test: This test demonstrates the importance of ECT on SYN and SYN-ACK packets by measuring the connection probability in the presence of competing flows for a DCTCP connection attempt *without* ECT in the SYN packet. The test was repeated five times for each number of competing flows. Competing Flows 1 | 2 | 4 | 8 | 16 ------------------------------ Mean Connection Probability 1 | 0.67 | 0.45 | 0.28 | 0 Median Connection Probability 1 | 0.65 | 0.45 | 0.25 | 0 As the number of competing flows moves beyond 1, the connection probability drops rapidly. Enabling DCTCP with this patch requires the following steps: DCTCP must be running both on the sender and receiver side in your data center, i.e.: sysctl -w net.ipv4.tcp_congestion_control=dctcp Also, ECN functionality must be enabled on all switches in your data center for DCTCP to work. The default ECN marking threshold (K) heuristic on the switch for DCTCP is e.g., 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps (K > 1/7 * C * RTT, [4]). In above tests, for each switch port, traffic was segregated into two queues. For any packet with a DSCP of 0x01 - or equivalently a TOS of 0x04 - the packet was placed into the DCTCP queue. All other packets were placed into the default drop-tail queue. For the DCTCP queue, RED/ECN marking was enabled, here, with a marking threshold of 75 KB. More details however, we refer you to the paper [2] under section 3). There are no code changes required to applications running in user space. DCTCP has been implemented in full *isolation* of the rest of the TCP code as its own congestion control module, so that it can run without a need to expose code to the core of the TCP stack, and thus nothing changes for non-DCTCP users. Changes in the CA framework code are minimal, and DCTCP algorithm operates on mechanisms that are already available in most Silicon. The gain (dctcp_shift_g) is currently a fixed constant (1/16) from the paper, but we leave the option that it can be chosen carefully to a different value by the user. In case DCTCP is being used and ECN support on peer site is off, DCTCP falls back after 3WHS to operate in normal TCP Reno mode. ss {-4,-6} -t -i diag interface: ... dctcp wscale:7,7 rto:203 rtt:2.349/0.026 mss:1448 cwnd:2054 ssthresh:1102 ce_state 0 alpha 15 ab_ecn 0 ab_tot 735584 send 10129.2Mbps pacing_rate 20254.1Mbps unacked:1822 retrans:0/15 reordering:101 rcv_space:29200 ... dctcp-reno wscale:7,7 rto:201 rtt:0.711/1.327 ato:40 mss:1448 cwnd:10 ssthresh:1102 fallback_mode send 162.9Mbps pacing_rate 325.5Mbps rcv_rtt:1.5 rcv_space:29200 More information about DCTCP can be found in [1-4]. [1] http://simula.stanford.edu/~alizade/Site/DCTCP.html [2] http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf [3] http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf [4] http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00 Joint work with Florian Westphal and Glenn Judd. Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- Documentation/networking/dctcp.txt | 43 +++++ include/uapi/linux/inet_diag.h | 13 +- net/ipv4/Kconfig | 26 ++- net/ipv4/Makefile | 1 + net/ipv4/tcp_dctcp.c | 344 +++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_output.c | 1 + 6 files changed, 425 insertions(+), 3 deletions(-) create mode 100644 Documentation/networking/dctcp.txt create mode 100644 net/ipv4/tcp_dctcp.c (limited to 'net') diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt new file mode 100644 index 000000000000..0d5dfbc89ec9 --- /dev/null +++ b/Documentation/networking/dctcp.txt @@ -0,0 +1,43 @@ +DCTCP (DataCenter TCP) +---------------------- + +DCTCP is an enhancement to the TCP congestion control algorithm for data +center networks and leverages Explicit Congestion Notification (ECN) in +the data center network to provide multi-bit feedback to the end hosts. + +To enable it on end hosts: + + sysctl -w net.ipv4.tcp_congestion_control=dctcp + +All switches in the data center network running DCTCP must support ECN +marking and be configured for marking when reaching defined switch buffer +thresholds. The default ECN marking threshold heuristic for DCTCP on +switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps, +but might need further careful tweaking. + +For more details, see below documents: + +Paper: + +The algorithm is further described in detail in the following two +SIGCOMM/SIGMETRICS papers: + + i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, + Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: + "Data Center TCP (DCTCP)", Data Center Networks session + Proc. ACM SIGCOMM, New Delhi, 2010. + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192 + +ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: + "Analysis of DCTCP: Stability, Convergence, and Fairness" + Proc. ACM SIGMETRICS, San Jose, 2011. + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf + +IETF informational draft: + + http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00 + +DCTCP site: + + http://simula.stanford.edu/~alizade/Site/DCTCP.html diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index bbde90fa5838..d65c0a09efd3 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -110,10 +110,10 @@ enum { INET_DIAG_TCLASS, INET_DIAG_SKMEMINFO, INET_DIAG_SHUTDOWN, + INET_DIAG_DCTCPINFO, }; -#define INET_DIAG_MAX INET_DIAG_SHUTDOWN - +#define INET_DIAG_MAX INET_DIAG_DCTCPINFO /* INET_DIAG_MEM */ @@ -133,5 +133,14 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; +/* INET_DIAG_DCTCPINFO */ + +struct tcp_dctcp_info { + __u16 dctcp_enabled; + __u16 dctcp_ce_state; + __u32 dctcp_alpha; + __u32 dctcp_ab_ecn; + __u32 dctcp_ab_tot; +}; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 84f710b7472a..69fb37854449 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -570,6 +570,27 @@ config TCP_CONG_ILLINOIS For further details see: http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html +config TCP_CONG_DCTCP + tristate "DataCenter TCP (DCTCP)" + default n + ---help--- + DCTCP leverages Explicit Congestion Notification (ECN) in the network to + provide multi-bit feedback to the end hosts. It is designed to provide: + + - High burst tolerance (incast due to partition/aggregate), + - Low latency (short flows, queries), + - High throughput (continuous data updates, large file transfers) with + commodity, shallow-buffered switches. + + All switches in the data center network running DCTCP must support + ECN marking and be configured for marking when reaching defined switch + buffer thresholds. The default ECN marking threshold heuristic for + DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets + (~100KB) at 10Gbps, but might need further careful tweaking. + + For further details see: + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -598,9 +619,11 @@ choice config DEFAULT_WESTWOOD bool "Westwood" if TCP_CONG_WESTWOOD=y + config DEFAULT_DCTCP + bool "DCTCP" if TCP_CONG_DCTCP=y + config DEFAULT_RENO bool "Reno" - endchoice endif @@ -620,6 +643,7 @@ config DEFAULT_TCP_CONG default "westwood" if DEFAULT_WESTWOOD default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO + default "dctcp" if DEFAULT_DCTCP default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index d78d404c596f..d8105787c199 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o +obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c new file mode 100644 index 000000000000..b504371af742 --- /dev/null +++ b/net/ipv4/tcp_dctcp.c @@ -0,0 +1,344 @@ +/* DataCenter TCP (DCTCP) congestion control. + * + * http://simula.stanford.edu/~alizade/Site/DCTCP.html + * + * This is an implementation of DCTCP over Reno, an enhancement to the + * TCP congestion control algorithm designed for data centers. DCTCP + * leverages Explicit Congestion Notification (ECN) in the network to + * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet + * the following three data center transport requirements: + * + * - High burst tolerance (incast due to partition/aggregate) + * - Low latency (short flows, queries) + * - High throughput (continuous data updates, large file transfers) + * with commodity shallow buffered switches + * + * The algorithm is described in detail in the following two papers: + * + * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, + * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: + * "Data Center TCP (DCTCP)", Data Center Networks session + * Proc. ACM SIGCOMM, New Delhi, 2010. + * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + * + * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: + * "Analysis of DCTCP: Stability, Convergence, and Fairness" + * Proc. ACM SIGMETRICS, San Jose, 2011. + * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf + * + * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh. + * + * Authors: + * + * Daniel Borkmann + * Florian Westphal + * Glenn Judd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include + +#define DCTCP_MAX_ALPHA 1024U + +struct dctcp { + u32 acked_bytes_ecn; + u32 acked_bytes_total; + u32 prior_snd_una; + u32 prior_rcv_nxt; + u32 dctcp_alpha; + u32 next_seq; + u32 ce_state; + u32 delayed_ack_reserved; +}; + +static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ +module_param(dctcp_shift_g, uint, 0644); +MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); + +static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; +module_param(dctcp_alpha_on_init, uint, 0644); +MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); + +static unsigned int dctcp_clamp_alpha_on_loss __read_mostly; +module_param(dctcp_clamp_alpha_on_loss, uint, 0644); +MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss, + "parameter for clamping alpha on loss"); + +static struct tcp_congestion_ops dctcp_reno; + +static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) +{ + ca->next_seq = tp->snd_nxt; + + ca->acked_bytes_ecn = 0; + ca->acked_bytes_total = 0; +} + +static void dctcp_init(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + if ((tp->ecn_flags & TCP_ECN_OK) || + (sk->sk_state == TCP_LISTEN || + sk->sk_state == TCP_CLOSE)) { + struct dctcp *ca = inet_csk_ca(sk); + + ca->prior_snd_una = tp->snd_una; + ca->prior_rcv_nxt = tp->rcv_nxt; + + ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); + + ca->delayed_ack_reserved = 0; + ca->ce_state = 0; + + dctcp_reset(tp, ca); + return; + } + + /* No ECN support? Fall back to Reno. Also need to clear + * ECT from sk since it is set during 3WHS for DCTCP. + */ + inet_csk(sk)->icsk_ca_ops = &dctcp_reno; + INET_ECN_dontxmit(sk); +} + +static u32 dctcp_ssthresh(struct sock *sk) +{ + const struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); +} + +/* Minimal DCTP CE state machine: + * + * S: 0 <- last pkt was non-CE + * 1 <- last pkt was CE + */ + +static void dctcp_ce_state_0_to_1(struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* State has changed from CE=0 to CE=1 and delayed + * ACK has not sent yet. + */ + if (!ca->ce_state && ca->delayed_ack_reserved) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ + tmp_rcv_nxt = tp->rcv_nxt; + + /* Generate previous ack with CE=0. */ + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tp->rcv_nxt = ca->prior_rcv_nxt; + + tcp_send_ack(sk); + + /* Recover current rcv_nxt. */ + tp->rcv_nxt = tmp_rcv_nxt; + } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 1; + + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; +} + +static void dctcp_ce_state_1_to_0(struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* State has changed from CE=1 to CE=0 and delayed + * ACK has not sent yet. + */ + if (ca->ce_state && ca->delayed_ack_reserved) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ + tmp_rcv_nxt = tp->rcv_nxt; + + /* Generate previous ack with CE=1. */ + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + tp->rcv_nxt = ca->prior_rcv_nxt; + + tcp_send_ack(sk); + + /* Recover current rcv_nxt. */ + tp->rcv_nxt = tmp_rcv_nxt; + } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 0; + + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static void dctcp_update_alpha(struct sock *sk, u32 flags) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct dctcp *ca = inet_csk_ca(sk); + u32 acked_bytes = tp->snd_una - ca->prior_snd_una; + + /* If ack did not advance snd_una, count dupack as MSS size. + * If ack did update window, do not count it at all. + */ + if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) + acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; + if (acked_bytes) { + ca->acked_bytes_total += acked_bytes; + ca->prior_snd_una = tp->snd_una; + + if (flags & CA_ACK_ECE) + ca->acked_bytes_ecn += acked_bytes; + } + + /* Expired RTT */ + if (!before(tp->snd_una, ca->next_seq)) { + /* For avoiding denominator == 1. */ + if (ca->acked_bytes_total == 0) + ca->acked_bytes_total = 1; + + /* alpha = (1 - g) * alpha + g * F */ + ca->dctcp_alpha = ca->dctcp_alpha - + (ca->dctcp_alpha >> dctcp_shift_g) + + (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / + ca->acked_bytes_total; + + if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) + /* Clamp dctcp_alpha to max. */ + ca->dctcp_alpha = DCTCP_MAX_ALPHA; + + dctcp_reset(tp, ca); + } +} + +static void dctcp_state(struct sock *sk, u8 new_state) +{ + if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) { + struct dctcp *ca = inet_csk_ca(sk); + + /* If this extension is enabled, we clamp dctcp_alpha to + * max on packet loss; the motivation is that dctcp_alpha + * is an indicator to the extend of congestion and packet + * loss is an indicator of extreme congestion; setting + * this in practice turned out to be beneficial, and + * effectively assumes total congestion which reduces the + * window by half. + */ + ca->dctcp_alpha = DCTCP_MAX_ALPHA; + } +} + +static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) +{ + struct dctcp *ca = inet_csk_ca(sk); + + switch (ev) { + case CA_EVENT_DELAYED_ACK: + if (!ca->delayed_ack_reserved) + ca->delayed_ack_reserved = 1; + break; + case CA_EVENT_NON_DELAYED_ACK: + if (ca->delayed_ack_reserved) + ca->delayed_ack_reserved = 0; + break; + default: + /* Don't care for the rest. */ + break; + } +} + +static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) +{ + switch (ev) { + case CA_EVENT_ECN_IS_CE: + dctcp_ce_state_0_to_1(sk); + break; + case CA_EVENT_ECN_NO_CE: + dctcp_ce_state_1_to_0(sk); + break; + case CA_EVENT_DELAYED_ACK: + case CA_EVENT_NON_DELAYED_ACK: + dctcp_update_ack_reserved(sk, ev); + break; + default: + /* Don't care for the rest. */ + break; + } +} + +static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + const struct dctcp *ca = inet_csk_ca(sk); + + /* Fill it also in case of VEGASINFO due to req struct limits. + * We can still correctly retrieve it later. + */ + if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcp_dctcp_info info; + + memset(&info, 0, sizeof(info)); + if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { + info.dctcp_enabled = 1; + info.dctcp_ce_state = (u16) ca->ce_state; + info.dctcp_alpha = ca->dctcp_alpha; + info.dctcp_ab_ecn = ca->acked_bytes_ecn; + info.dctcp_ab_tot = ca->acked_bytes_total; + } + + nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); + } +} + +static struct tcp_congestion_ops dctcp __read_mostly = { + .init = dctcp_init, + .in_ack_event = dctcp_update_alpha, + .cwnd_event = dctcp_cwnd_event, + .ssthresh = dctcp_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .set_state = dctcp_state, + .get_info = dctcp_get_info, + .flags = TCP_CONG_NEEDS_ECN, + .owner = THIS_MODULE, + .name = "dctcp", +}; + +static struct tcp_congestion_ops dctcp_reno __read_mostly = { + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .get_info = dctcp_get_info, + .owner = THIS_MODULE, + .name = "dctcp-reno", +}; + +static int __init dctcp_register(void) +{ + BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&dctcp); +} + +static void __exit dctcp_unregister(void) +{ + tcp_unregister_congestion_control(&dctcp); +} + +module_init(dctcp_register); +module_exit(dctcp_unregister); + +MODULE_AUTHOR("Daniel Borkmann "); +MODULE_AUTHOR("Florian Westphal "); +MODULE_AUTHOR("Glenn Judd "); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("DataCenter TCP (DCTCP)"); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 124f9e4e4594..86a0216fcaa1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3211,6 +3211,7 @@ void tcp_send_ack(struct sock *sk) skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } +EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. -- cgit v1.2.3 From 59790aa2873cb3c32db02c777f08eb19faccf5fa Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Mon, 29 Sep 2014 10:55:46 +0300 Subject: Bluetooth: 6lowpan: Make sure skb exists before accessing it We need to make sure that the saved skb exists when resuming or suspending a CoC channel. This can happen if initial credits is 0 when channel is connected. Signed-off-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 0920cb6ed572..5d3e6202da3d 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -950,6 +950,9 @@ static void chan_suspend_cb(struct l2cap_chan *chan) BT_DBG("chan %p conn %p skb %p", chan, chan->conn, skb); + if (!skb) + return; + lowpan_cb(skb)->status = -EAGAIN; } @@ -959,6 +962,9 @@ static void chan_resume_cb(struct l2cap_chan *chan) BT_DBG("chan %p conn %p skb %p", chan, chan->conn, skb); + if (!skb) + return; + lowpan_cb(skb)->status = 0; } -- cgit v1.2.3 From 9363dc4b599949bde338cdaba1cf7cac243e4e97 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Tue, 23 Sep 2014 13:30:41 +0200 Subject: netfilter: nf_tables: store and dump set policy We want to know in which cases the user explicitly sets the policy options. In that case, we also want to dump back the info. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c4d86198d3d6..3d7292392fac 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -241,6 +241,7 @@ void nft_unregister_set(struct nft_set_ops *ops); * @dtype: data type (verdict or numeric type defined by userspace) * @size: maximum set size * @nelems: number of elements + * @policy: set parameterization (see enum nft_set_policies) * @ops: set ops * @flags: set flags * @klen: key length @@ -255,6 +256,7 @@ struct nft_set { u32 dtype; u32 size; u32 nelems; + u16 policy; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a476b9962155..19e79f0d9ad2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2344,6 +2344,11 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + if (set->policy != NFT_SET_POL_PERFORMANCE) { + if (nla_put_be32(skb, NFTA_SET_POLICY, htonl(set->policy))) + goto nla_put_failure; + } + desc = nla_nest_start(skb, NFTA_SET_DESC); if (desc == NULL) goto nla_put_failure; @@ -2669,6 +2674,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, set->dlen = desc.dlen; set->flags = flags; set->size = desc.size; + set->policy = policy; err = ops->init(set, &desc, nla); if (err < 0) -- cgit v1.2.3 From db29a9508a9246e77087c5531e45b2c88ec6988b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 26 Sep 2014 11:35:42 +0200 Subject: netfilter: conntrack: disable generic tracking for known protocols Given following iptables ruleset: -P FORWARD DROP -A FORWARD -m sctp --dport 9 -j ACCEPT -A FORWARD -p tcp --dport 80 -j ACCEPT -A FORWARD -p tcp -m conntrack -m state ESTABLISHED,RELATED -j ACCEPT One would assume that this allows SCTP on port 9 and TCP on port 80. Unfortunately, if the SCTP conntrack module is not loaded, this allows *all* SCTP communication, to pass though, i.e. -p sctp -j ACCEPT, which we think is a security issue. This is because on the first SCTP packet on port 9, we create a dummy "generic l4" conntrack entry without any port information (since conntrack doesn't know how to extract this information). All subsequent packets that are unknown will then be in established state since they will fallback to proto_generic and will match the 'generic' entry. Our originally proposed version [1] completely disabled generic protocol tracking, but Jozsef suggests to not track protocols for which a more suitable helper is available, hence we now mitigate the issue for in tree known ct protocol helpers only, so that at least NAT and direction information will still be preserved for others. [1] http://www.spinics.net/lists/netfilter-devel/msg33430.html Joint work with Daniel Borkmann. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_generic.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index d25f29377648..957c1db66652 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -14,6 +14,30 @@ static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ; +static bool nf_generic_should_process(u8 proto) +{ + switch (proto) { +#ifdef CONFIG_NF_CT_PROTO_SCTP_MODULE + case IPPROTO_SCTP: + return false; +#endif +#ifdef CONFIG_NF_CT_PROTO_DCCP_MODULE + case IPPROTO_DCCP: + return false; +#endif +#ifdef CONFIG_NF_CT_PROTO_GRE_MODULE + case IPPROTO_GRE: + return false; +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE_MODULE + case IPPROTO_UDPLITE: + return false; +#endif + default: + return true; + } +} + static inline struct nf_generic_net *generic_pernet(struct net *net) { return &net->ct.nf_ct_proto.generic; @@ -67,7 +91,7 @@ static int generic_packet(struct nf_conn *ct, static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { - return true; + return nf_generic_should_process(nf_ct_protonum(ct)); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) -- cgit v1.2.3 From 36b3dd250dde5317fa6bb8c9010e1e7ab7f2265a Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Mon, 29 Sep 2014 16:37:25 +0300 Subject: Bluetooth: 6lowpan: Ensure header compression does not corrupt IPv6 header If skb is going to multiple destinations, then make sure that we do not overwrite the common IPv6 headers. So before compressing the IPv6 headers, we copy the skb and that is then sent to 6LoWPAN Bluetooth devices. This is a similar patch as what was done for IEEE 802.154 6LoWPAN in commit f19f4f9525cf3 ("ieee802154: 6lowpan: ensure header compression does not corrupt ipv6 header") Signed-off-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 135 +++++++++++++++++++++++++++++------------------- 1 file changed, 83 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 5d3e6202da3d..2ec7c84c2000 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -426,38 +426,33 @@ static void convert_dest_bdaddr(struct in6_addr *ip6_daddr, *addr_type = get_addr_type_from_eui64(addr->b[5]); } -static int header_create(struct sk_buff *skb, struct net_device *netdev, - unsigned short type, const void *_daddr, - const void *_saddr, unsigned int len) +static int setup_header(struct sk_buff *skb, struct net_device *netdev, + bdaddr_t *peer_addr, u8 *peer_addr_type) { - struct ipv6hdr *hdr; + struct in6_addr ipv6_daddr; struct lowpan_dev *dev; struct lowpan_peer *peer; bdaddr_t addr, *any = BDADDR_ANY; - u8 *saddr, *daddr = any->b; - u8 addr_type; - - if (type != ETH_P_IPV6) - return -EINVAL; - - hdr = ipv6_hdr(skb); + u8 *daddr = any->b; + int err, status = 0; dev = lowpan_dev(netdev); - if (ipv6_addr_is_multicast(&hdr->daddr)) { - memcpy(&lowpan_cb(skb)->addr, &hdr->daddr, - sizeof(struct in6_addr)); + memcpy(&ipv6_daddr, &lowpan_cb(skb)->addr, sizeof(ipv6_daddr)); + + if (ipv6_addr_is_multicast(&ipv6_daddr)) { lowpan_cb(skb)->chan = NULL; } else { unsigned long flags; + u8 addr_type; /* Get destination BT device from skb. * If there is no such peer then discard the packet. */ - convert_dest_bdaddr(&hdr->daddr, &addr, &addr_type); + convert_dest_bdaddr(&ipv6_daddr, &addr, &addr_type); BT_DBG("dest addr %pMR type %d IP %pI6c", &addr, - addr_type, &hdr->daddr); + addr_type, &ipv6_daddr); read_lock_irqsave(&devices_lock, flags); peer = peer_lookup_ba(dev, &addr, addr_type); @@ -470,7 +465,7 @@ static int header_create(struct sk_buff *skb, struct net_device *netdev, * the destination address. */ read_lock_irqsave(&devices_lock, flags); - peer = peer_lookup_dst(dev, &hdr->daddr, skb); + peer = peer_lookup_dst(dev, &ipv6_daddr, skb); read_unlock_irqrestore(&devices_lock, flags); if (!peer) { BT_DBG("no such peer %pMR found", &addr); @@ -479,29 +474,56 @@ static int header_create(struct sk_buff *skb, struct net_device *netdev, } daddr = peer->eui64_addr; - - memcpy(&lowpan_cb(skb)->addr, &hdr->daddr, - sizeof(struct in6_addr)); + *peer_addr = addr; + *peer_addr_type = addr_type; lowpan_cb(skb)->chan = peer->chan; + + status = 1; } - saddr = dev->netdev->dev_addr; + lowpan_header_compress(skb, netdev, ETH_P_IPV6, daddr, + dev->netdev->dev_addr, skb->len); + + err = dev_hard_header(skb, netdev, ETH_P_IPV6, NULL, NULL, 0); + if (err < 0) + return err; + + return status; +} + +static int header_create(struct sk_buff *skb, struct net_device *netdev, + unsigned short type, const void *_daddr, + const void *_saddr, unsigned int len) +{ + struct ipv6hdr *hdr; + + if (type != ETH_P_IPV6) + return -EINVAL; + + hdr = ipv6_hdr(skb); + + memcpy(&lowpan_cb(skb)->addr, &hdr->daddr, sizeof(struct in6_addr)); - return lowpan_header_compress(skb, netdev, type, daddr, saddr, len); + return 0; } /* Packet to BT LE device */ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, - struct net_device *netdev) + struct net_device *netdev, bool is_mcast) { struct msghdr msg; struct kvec iv; int err; /* Remember the skb so that we can send EAGAIN to the caller if - * we run out of credits. + * we run out of credits. This is not done for multicast packets + * because we generate mcast packet in this module and are not + * really able to remember the skb after this packet is sent. */ - chan->data = skb; + if (is_mcast) + chan->data = NULL; + else + chan->data = skb; memset(&msg, 0, sizeof(msg)); msg.msg_iov = (struct iovec *) &iv; @@ -549,7 +571,11 @@ static void send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) list_for_each_entry_safe(pentry, ptmp, &dev->peers, list) { local_skb = skb_clone(skb, GFP_ATOMIC); - send_pkt(pentry->chan, local_skb, netdev); + BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p", + netdev->name, + &pentry->chan->dst, pentry->chan->dst_type, + &pentry->peer_addr, pentry->chan); + send_pkt(pentry->chan, local_skb, netdev, true); kfree_skb(local_skb); } @@ -561,43 +587,48 @@ static void send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) { int err = 0; - struct lowpan_dev *dev; - struct lowpan_peer *peer; bdaddr_t addr; u8 addr_type; - if (ipv6_addr_is_multicast(&lowpan_cb(skb)->addr)) { - /* We need to send the packet to every device - * behind this interface. - */ - send_mcast_pkt(skb, netdev); - } else { - unsigned long flags; - - convert_dest_bdaddr(&lowpan_cb(skb)->addr, &addr, &addr_type); - dev = lowpan_dev(netdev); - - read_lock_irqsave(&devices_lock, flags); - peer = peer_lookup_ba(dev, &addr, addr_type); - if (!peer) - peer = peer_lookup_dst(dev, &lowpan_cb(skb)->addr, skb); - read_unlock_irqrestore(&devices_lock, flags); + /* We must take a copy of the skb before we modify/replace the ipv6 + * header as the header could be used elsewhere + */ + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; - BT_DBG("xmit %s to %pMR type %d IP %pI6c peer %p", - netdev->name, &addr, addr_type, - &lowpan_cb(skb)->addr, peer); + /* Return values from setup_header() + * <0 - error, packet is dropped + * 0 - this is a multicast packet + * 1 - this is unicast packet + */ + err = setup_header(skb, netdev, &addr, &addr_type); + if (err < 0) { + kfree_skb(skb); + return NET_XMIT_DROP; + } - if (peer && peer->chan) - err = send_pkt(peer->chan, skb, netdev); - else + if (err) { + if (lowpan_cb(skb)->chan) { + BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p", + netdev->name, &addr, addr_type, + &lowpan_cb(skb)->addr, lowpan_cb(skb)->chan); + err = send_pkt(lowpan_cb(skb)->chan, skb, netdev, + false); + } else { err = -ENOENT; + } + } else { + /* We need to send the packet to every device behind this + * interface. + */ + send_mcast_pkt(skb, netdev); } - dev_kfree_skb(skb); if (err) BT_DBG("ERROR: xmit failed (%d)", err); - return (err < 0) ? NET_XMIT_DROP : err; + return err < 0 ? NET_XMIT_DROP : err; } static const struct net_device_ops netdev_ops = { -- cgit v1.2.3 From 156395c9989a76228e0da40e71267a3d4fb07419 Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Mon, 29 Sep 2014 16:37:26 +0300 Subject: Bluetooth: 6lowpan: Enable multicast support Set multicast support for 6lowpan network interface. This is needed in every network interface that supports IPv6. Signed-off-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 2ec7c84c2000..f0432aea8dad 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -648,7 +648,8 @@ static void netdev_setup(struct net_device *dev) dev->needed_tailroom = 0; dev->mtu = IPV6_MIN_MTU; dev->tx_queue_len = 0; - dev->flags = IFF_RUNNING | IFF_POINTOPOINT; + dev->flags = IFF_RUNNING | IFF_POINTOPOINT | + IFF_MULTICAST; dev->watchdog_timeo = 0; dev->netdev_ops = &netdev_ops; -- cgit v1.2.3 From b1937227316417aa7568d01e6fa1f272e98fb890 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2014 22:18:47 -0700 Subject: net: reorganize sk_buff for faster __copy_skb_header() With proliferation of bit fields in sk_buff, __copy_skb_header() became quite expensive, showing as the most expensive function in a GSO workload. __copy_skb_header() performance is also critical for non GSO TCP operations, as it is used from skb_clone() This patch carefully moves all the fields that were not copied in a separate zone : cloned, nohdr, fclone, peeked, head_frag, xmit_more Then I moved all other fields and all other copied fields in a section delimited by headers_start[0]/headers_end[0] section so that we can use a single memcpy() call, inlined by compiler using long word load/stores. I also tried to make all copies in the natural orders of sk_buff, to help hardware prefetching. I made sure sk_buff size did not change. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 133 ++++++++++++++++++++++++++----------------------- net/core/skbuff.c | 80 ++++++++++++++--------------- 2 files changed, 113 insertions(+), 100 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8eaa62400fca..b6cced304b26 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -527,27 +527,41 @@ struct sk_buff { char cb[48] __aligned(8); unsigned long _skb_refdst; + void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_XFRM struct sec_path *sp; +#endif +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + struct nf_conntrack *nfct; +#endif +#ifdef CONFIG_BRIDGE_NETFILTER + struct nf_bridge_info *nf_bridge; #endif unsigned int len, data_len; __u16 mac_len, hdr_len; - union { - __wsum csum; - struct { - __u16 csum_start; - __u16 csum_offset; - }; - }; - __u32 priority; + + /* Following fields are _not_ copied in __copy_skb_header() + * Note that queue_mapping is here mostly to fill a hole. + */ kmemcheck_bitfield_begin(flags1); - __u8 ignore_df:1, - cloned:1, - ip_summed:2, + __u16 queue_mapping; + __u8 cloned:1, nohdr:1, - nfctinfo:3; + fclone:2, + peeked:1, + head_frag:1, + xmit_more:1; + /* one bit hole */ + kmemcheck_bitfield_end(flags1); + + + + /* fields enclosed in headers_start/headers_end are copied + * using a single memcpy() in __copy_skb_header() + */ + __u32 headers_start[0]; /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD @@ -558,58 +572,53 @@ struct sk_buff { #define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) __u8 __pkt_type_offset[0]; - __u8 pkt_type:3, - fclone:2, - ipvs_property:1, - peeked:1, - nf_trace:1; - kmemcheck_bitfield_end(flags1); - __be16 protocol; - - void (*destructor)(struct sk_buff *skb); -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - struct nf_conntrack *nfct; -#endif -#ifdef CONFIG_BRIDGE_NETFILTER - struct nf_bridge_info *nf_bridge; -#endif - - int skb_iif; - - __u32 hash; - - __be16 vlan_proto; - __u16 vlan_tci; - -#ifdef CONFIG_NET_SCHED - __u16 tc_index; /* traffic control index */ -#ifdef CONFIG_NET_CLS_ACT - __u16 tc_verd; /* traffic control verdict */ -#endif -#endif - - __u16 queue_mapping; - kmemcheck_bitfield_begin(flags2); - __u8 xmit_more:1; -#ifdef CONFIG_IPV6_NDISC_NODETYPE - __u8 ndisc_nodetype:2; -#endif + __u8 pkt_type:3; __u8 pfmemalloc:1; + __u8 ignore_df:1; + __u8 nfctinfo:3; + + __u8 nf_trace:1; + __u8 ip_summed:2; __u8 ooo_okay:1; __u8 l4_hash:1; __u8 sw_hash:1; __u8 wifi_acked_valid:1; __u8 wifi_acked:1; + __u8 no_fcs:1; - __u8 head_frag:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; __u8 csum_complete_sw:1; - /* 1/3 bit hole (depending on ndisc_nodetype presence) */ - kmemcheck_bitfield_end(flags2); + __u8 csum_level:2; + __u8 csum_bad:1; +#ifdef CONFIG_IPV6_NDISC_NODETYPE + __u8 ndisc_nodetype:2; +#endif + __u8 ipvs_property:1; + /* 5 or 7 bit hole */ + +#ifdef CONFIG_NET_SCHED + __u16 tc_index; /* traffic control index */ +#ifdef CONFIG_NET_CLS_ACT + __u16 tc_verd; /* traffic control verdict */ +#endif +#endif + + union { + __wsum csum; + struct { + __u16 csum_start; + __u16 csum_offset; + }; + }; + __u32 priority; + int skb_iif; + __u32 hash; + __be16 vlan_proto; + __u16 vlan_tci; #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL union { unsigned int napi_id; @@ -625,19 +634,18 @@ struct sk_buff { __u32 reserved_tailroom; }; - kmemcheck_bitfield_begin(flags3); - __u8 csum_level:2; - __u8 csum_bad:1; - /* 13 bit hole */ - kmemcheck_bitfield_end(flags3); - __be16 inner_protocol; __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; + + __be16 protocol; __u16 transport_header; __u16 network_header; __u16 mac_header; + + __u32 headers_end[0]; + /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; sk_buff_data_t end; @@ -3040,19 +3048,22 @@ static inline void nf_reset_trace(struct sk_buff *skb) } /* Note: This doesn't put any conntrack and bridge info in dst. */ -static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) +static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, + bool copy) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) dst->nfct = src->nfct; nf_conntrack_get(src->nfct); - dst->nfctinfo = src->nfctinfo; + if (copy) + dst->nfctinfo = src->nfctinfo; #endif #ifdef CONFIG_BRIDGE_NETFILTER dst->nf_bridge = src->nf_bridge; nf_bridge_get(src->nf_bridge); #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) - dst->nf_trace = src->nf_trace; + if (copy) + dst->nf_trace = src->nf_trace; #endif } @@ -3064,7 +3075,7 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(dst->nf_bridge); #endif - __nf_copy(dst, src); + __nf_copy(dst, src, true); } #ifdef CONFIG_NETWORK_SECMARK diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d4fdc649112c..4be570a4ab21 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -261,7 +261,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, atomic_t *fclone_ref = (atomic_t *) (child + 1); kmemcheck_annotate_bitfield(child, flags1); - kmemcheck_annotate_bitfield(child, flags2); skb->fclone = SKB_FCLONE_ORIG; atomic_set(fclone_ref, 1); @@ -675,57 +674,61 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); +/* Make sure a field is enclosed inside headers_start/headers_end section */ +#define CHECK_SKB_FIELD(field) \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ + offsetof(struct sk_buff, headers_start)); \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) > \ + offsetof(struct sk_buff, headers_end)); \ + static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; + /* We do not copy old->sk */ new->dev = old->dev; - new->transport_header = old->transport_header; - new->network_header = old->network_header; - new->mac_header = old->mac_header; - new->inner_protocol = old->inner_protocol; - new->inner_transport_header = old->inner_transport_header; - new->inner_network_header = old->inner_network_header; - new->inner_mac_header = old->inner_mac_header; + memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); - skb_copy_hash(new, old); - new->ooo_okay = old->ooo_okay; - new->no_fcs = old->no_fcs; - new->encapsulation = old->encapsulation; - new->encap_hdr_csum = old->encap_hdr_csum; - new->csum_valid = old->csum_valid; - new->csum_complete_sw = old->csum_complete_sw; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif - memcpy(new->cb, old->cb, sizeof(old->cb)); - new->csum = old->csum; - new->ignore_df = old->ignore_df; - new->pkt_type = old->pkt_type; - new->ip_summed = old->ip_summed; - skb_copy_queue_mapping(new, old); - new->priority = old->priority; -#if IS_ENABLED(CONFIG_IP_VS) - new->ipvs_property = old->ipvs_property; + __nf_copy(new, old, false); + + /* Note : this field could be in headers_start/headers_end section + * It is not yet because we do not want to have a 16 bit hole + */ + new->queue_mapping = old->queue_mapping; + + memcpy(&new->headers_start, &old->headers_start, + offsetof(struct sk_buff, headers_end) - + offsetof(struct sk_buff, headers_start)); + CHECK_SKB_FIELD(protocol); + CHECK_SKB_FIELD(csum); + CHECK_SKB_FIELD(hash); + CHECK_SKB_FIELD(priority); + CHECK_SKB_FIELD(skb_iif); + CHECK_SKB_FIELD(vlan_proto); + CHECK_SKB_FIELD(vlan_tci); + CHECK_SKB_FIELD(transport_header); + CHECK_SKB_FIELD(network_header); + CHECK_SKB_FIELD(mac_header); + CHECK_SKB_FIELD(inner_protocol); + CHECK_SKB_FIELD(inner_transport_header); + CHECK_SKB_FIELD(inner_network_header); + CHECK_SKB_FIELD(inner_mac_header); + CHECK_SKB_FIELD(mark); +#ifdef CONFIG_NETWORK_SECMARK + CHECK_SKB_FIELD(secmark); +#endif +#ifdef CONFIG_NET_RX_BUSY_POLL + CHECK_SKB_FIELD(napi_id); #endif - new->pfmemalloc = old->pfmemalloc; - new->protocol = old->protocol; - new->mark = old->mark; - new->skb_iif = old->skb_iif; - __nf_copy(new, old); #ifdef CONFIG_NET_SCHED - new->tc_index = old->tc_index; + CHECK_SKB_FIELD(tc_index); #ifdef CONFIG_NET_CLS_ACT - new->tc_verd = old->tc_verd; + CHECK_SKB_FIELD(tc_verd); #endif #endif - new->vlan_proto = old->vlan_proto; - new->vlan_tci = old->vlan_tci; - - skb_copy_secmark(new, old); -#ifdef CONFIG_NET_RX_BUSY_POLL - new->napi_id = old->napi_id; -#endif } /* @@ -876,7 +879,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return NULL; kmemcheck_annotate_bitfield(n, flags1); - kmemcheck_annotate_bitfield(n, flags2); n->fclone = SKB_FCLONE_UNAVAILABLE; } -- cgit v1.2.3 From 41c91996d99394a75912aa5bfda300b85789ed43 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 29 Sep 2014 15:04:37 +0800 Subject: tcp: remove unnecessary assignment. This variable i is overwritten to 0 by following code Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 9771563ab564..815c85e3b1e0 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -115,7 +115,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { struct in6_addr *buf = (struct in6_addr *) tmp.val; - int i = 4; + int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; -- cgit v1.2.3 From d82bd1229885d550d03926cfa937703f6caa3cc0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 29 Sep 2014 13:08:29 +0200 Subject: tcp: move TCP_ECN_create_request out of header After Octavian Purdilas tcp ipv4/ipv6 unification work this helper only has a single callsite. While at it, convert name to lowercase, suggested by Stephen. Suggested-by: Stephen Hemminger Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/tcp.h | 34 ---------------------------------- net/ipv4/tcp_input.c | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1f57c5363492..545a79aa4212 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -861,40 +861,6 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } -/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set - * - * If we receive a SYN packet with these bits set, it means a - * network is playing bad games with TOS bits. In order to - * avoid possible false congestion notifications, we disable - * TCP ECN negociation. - * - * Exception: tcp_ca wants ECN. This is required for DCTCP - * congestion control; it requires setting ECT on all packets, - * including SYN. We inverse the test in this case: If our - * local socket wants ECN, but peer only set ece/cwr (but not - * ECT in IP header) its probably a non-DCTCP aware sender. - */ -static inline void -TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, - const struct sock *listen_sk) -{ - const struct tcphdr *th = tcp_hdr(skb); - const struct net *net = sock_net(listen_sk); - bool th_ecn = th->ece && th->cwr; - bool ect, need_ecn; - - if (!th_ecn) - return; - - ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); - need_ecn = tcp_ca_needs_ecn(listen_sk); - - if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) - inet_rsk(req)->ecn_ok = 1; - else if (ect && need_ecn) - inet_rsk(req)->ecn_ok = 1; -} - /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fc133178c787..174181e28ef3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5906,6 +5906,40 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) #endif } +/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set + * + * If we receive a SYN packet with these bits set, it means a + * network is playing bad games with TOS bits. In order to + * avoid possible false congestion notifications, we disable + * TCP ECN negociation. + * + * Exception: tcp_ca wants ECN. This is required for DCTCP + * congestion control; it requires setting ECT on all packets, + * including SYN. We inverse the test in this case: If our + * local socket wants ECN, but peer only set ece/cwr (but not + * ECT in IP header) its probably a non-DCTCP aware sender. + */ +static void tcp_ecn_create_request(struct request_sock *req, + const struct sk_buff *skb, + const struct sock *listen_sk) +{ + const struct tcphdr *th = tcp_hdr(skb); + const struct net *net = sock_net(listen_sk); + bool th_ecn = th->ece && th->cwr; + bool ect, need_ecn; + + if (!th_ecn) + return; + + ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); + need_ecn = tcp_ca_needs_ecn(listen_sk); + + if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + inet_rsk(req)->ecn_ok = 1; + else if (ect && need_ecn) + inet_rsk(req)->ecn_ok = 1; +} + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) @@ -5966,7 +6000,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sk); + tcp_ecn_create_request(req, skb, sk); if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); -- cgit v1.2.3 From 735d383117e113403442d971b23e7cfa2f876c7c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 29 Sep 2014 13:08:30 +0200 Subject: tcp: change TCP_ECN prefixes to lower case Suggested by Stephen. Also drop inline keyword and let compiler decide. gcc 4.7.3 decides to no longer inline tcp_ecn_check_ce, so split it up. The actual evaluation is not inlined anymore while the ECN_OK test is. Suggested-by: Stephen Hemminger Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 41 ++++++++++++++++++++++------------------- net/ipv4/tcp_minisocks.c | 6 +++--- net/ipv4/tcp_output.c | 18 +++++++++--------- 3 files changed, 34 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 174181e28ef3..aa38f98b7884 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -201,28 +201,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk) return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } -static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) +static void tcp_ecn_queue_cwr(struct tcp_sock *tp) { if (tp->ecn_flags & TCP_ECN_OK) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } -static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) +static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) { if (tcp_hdr(skb)->cwr) tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } -static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) +static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) { tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } -static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) +static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) { - if (!(tp->ecn_flags & TCP_ECN_OK)) - return; - switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: /* Funny extension: if ECT is not set on a segment, @@ -251,19 +248,25 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s } } -static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) +static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) +{ + if (tp->ecn_flags & TCP_ECN_OK) + __tcp_ecn_check_ce(tp, skb); +} + +static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } -static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) +static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } -static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) +static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) return true; @@ -660,7 +663,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) } icsk->icsk_ack.lrcvtime = now; - TCP_ECN_check_ce(tp, skb); + tcp_ecn_check_ce(tp, skb); if (skb->len >= 128) tcp_grow_window(sk, skb); @@ -1976,7 +1979,7 @@ void tcp_enter_loss(struct sock *sk) sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; - TCP_ECN_queue_cwr(tp); + tcp_ecn_queue_cwr(tp); /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous * loss recovery is underway except recurring timeout(s) on @@ -2368,7 +2371,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; - TCP_ECN_withdraw_cwr(tp); + tcp_ecn_withdraw_cwr(tp); } } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); @@ -2498,7 +2501,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tp->prr_delivered = 0; tp->prr_out = 0; tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); - TCP_ECN_queue_cwr(tp); + tcp_ecn_queue_cwr(tp); } static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, @@ -3453,7 +3456,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_rtt_us); - if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) { + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; ack_ev_flags |= CA_ACK_ECE; } @@ -4193,7 +4196,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) struct sk_buff *skb1; u32 seq, end_seq; - TCP_ECN_check_ce(tp, skb); + tcp_ecn_check_ce(tp, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); @@ -4376,7 +4379,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); - TCP_ECN_accept_cwr(tp, skb); + tcp_ecn_accept_cwr(tp, skb); tp->rx_opt.dsack = 0; @@ -5457,7 +5460,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * state to ESTABLISHED..." */ - TCP_ECN_rcv_synack(tp, th); + tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_ack(sk, skb, FLAG_SLOWPATH); @@ -5576,7 +5579,7 @@ discard: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - TCP_ECN_rcv_syn(tp, th); + tcp_ecn_rcv_syn(tp, th); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 47b73506b77e..63d2680b65db 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -393,8 +393,8 @@ void tcp_openreq_init_rwin(struct request_sock *req, } EXPORT_SYMBOL(tcp_openreq_init_rwin); -static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, - struct request_sock *req) +static void tcp_ecn_openreq_child(struct tcp_sock *tp, + const struct request_sock *req) { tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; } @@ -507,7 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; - TCP_ECN_openreq_child(newtp, req); + tcp_ecn_openreq_child(newtp, req); newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 86a0216fcaa1..ee567e9e98c3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -318,7 +318,7 @@ static u16 tcp_select_window(struct sock *sk) } /* Packet ECN state for a SYN-ACK */ -static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb) +static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); @@ -330,7 +330,7 @@ static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb) } /* Packet ECN state for a SYN. */ -static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) +static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -344,8 +344,8 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) } } -static __inline__ void -TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th, +static void +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, struct sock *sk) { if (inet_rsk(req)->ecn_ok) { @@ -358,7 +358,7 @@ TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th, /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ -static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, +static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, int tcp_header_len) { struct tcp_sock *tp = tcp_sk(sk); @@ -960,7 +960,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_options_write((__be32 *)(th + 1), tp, &opts); if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) - TCP_ECN_send(sk, skb, tcp_header_size); + tcp_ecn_send(sk, skb, tcp_header_size); #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ @@ -2800,7 +2800,7 @@ int tcp_send_synack(struct sock *sk) } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; - TCP_ECN_send_synack(sk, skb); + tcp_ecn_send_synack(sk, skb); } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -2859,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - TCP_ECN_make_synack(req, th, sk); + tcp_ecn_make_synack(req, th, sk); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is @@ -3098,7 +3098,7 @@ int tcp_connect(struct sock *sk) tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tp->retrans_stamp = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); - TCP_ECN_send_syn(sk, buff); + tcp_ecn_send_syn(sk, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : -- cgit v1.2.3 From 22e0f8b9322cb1a48b1357e8f4ae6f5a9eca8cfa Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 28 Sep 2014 11:52:56 -0700 Subject: net: sched: make bstats per cpu and estimator RCU safe In order to run qdisc's without locking statistics and estimators need to be handled correctly. To resolve bstats make the statistics per cpu. And because this is only needed for qdiscs that are running without locks which is not the case for most qdiscs in the near future only create percpu stats when qdiscs set the TCQ_F_CPUSTATS flag. Next because estimators use the bstats to calculate packets per second and bytes per second the estimator code paths are updated to use the per cpu statistics. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/gen_stats.h | 11 ++++++++++ include/net/sch_generic.h | 22 ++++++++++++++++++- net/core/gen_estimator.c | 29 +++++++++++++++---------- net/core/gen_stats.c | 53 +++++++++++++++++++++++++++++++++++++++++----- net/netfilter/xt_RATEEST.c | 2 +- net/sched/act_api.c | 5 +++-- net/sched/act_police.c | 2 +- net/sched/sch_api.c | 29 +++++++++++++++++++------ net/sched/sch_atm.c | 2 +- net/sched/sch_cbq.c | 7 +++--- net/sched/sch_drr.c | 7 +++--- net/sched/sch_generic.c | 3 +++ net/sched/sch_hfsc.c | 13 +++++++----- net/sched/sch_htb.c | 12 +++++++---- net/sched/sch_mq.c | 2 +- net/sched/sch_mqprio.c | 4 ++-- net/sched/sch_multiq.c | 2 +- net/sched/sch_prio.c | 2 +- net/sched/sch_qfq.c | 8 ++++--- 19 files changed, 164 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index ea4271dceff0..ce3c1281f2a0 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -6,6 +6,11 @@ #include #include +struct gnet_stats_basic_cpu { + struct gnet_stats_basic_packed bstats; + struct u64_stats_sync syncp; +}; + struct gnet_dump { spinlock_t * lock; struct sk_buff * skb; @@ -27,7 +32,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d); int gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b); +void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b); int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, struct gnet_stats_rate_est64 *r); @@ -37,11 +46,13 @@ int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt); void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est64 *rate_est); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt); bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e65b8e0752af..4b9351120fd8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,7 @@ struct Qdisc { * multiqueue device. */ #define TCQ_F_WARN_NONWC (1 << 16) +#define TCQ_F_CPUSTATS 0x20 /* run using percpu statistics */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; @@ -83,7 +85,10 @@ struct Qdisc { */ unsigned long state; struct sk_buff_head q; - struct gnet_stats_basic_packed bstats; + union { + struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + } __packed; unsigned int __state; struct gnet_stats_queue qstats; struct rcu_head rcu_head; @@ -487,6 +492,10 @@ static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch) return qdisc_enqueue(skb, sch) & NET_XMIT_MASK; } +static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) +{ + return q->flags & TCQ_F_CPUSTATS; +} static inline void bstats_update(struct gnet_stats_basic_packed *bstats, const struct sk_buff *skb) @@ -495,6 +504,17 @@ static inline void bstats_update(struct gnet_stats_basic_packed *bstats, bstats->packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; } +static inline void qdisc_bstats_update_cpu(struct Qdisc *sch, + const struct sk_buff *skb) +{ + struct gnet_stats_basic_cpu *bstats = + this_cpu_ptr(sch->cpu_bstats); + + u64_stats_update_begin(&bstats->syncp); + bstats_update(&bstats->bstats, skb); + u64_stats_update_end(&bstats->syncp); +} + static inline void qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) { diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 9d33dfffca19..9dfb88a933e7 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -91,6 +91,8 @@ struct gen_estimator u32 avpps; struct rcu_head e_rcu; struct rb_node node; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct rcu_head head; }; struct gen_estimator_head @@ -115,9 +117,8 @@ static void est_timer(unsigned long arg) rcu_read_lock(); list_for_each_entry_rcu(e, &elist[idx].list, list) { - u64 nbytes; + struct gnet_stats_basic_packed b = {0}; u64 brate; - u32 npackets; u32 rate; spin_lock(e->stats_lock); @@ -125,15 +126,15 @@ static void est_timer(unsigned long arg) if (e->bstats == NULL) goto skip; - nbytes = e->bstats->bytes; - npackets = e->bstats->packets; - brate = (nbytes - e->last_bytes)<<(7 - idx); - e->last_bytes = nbytes; + __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats); + + brate = (b.bytes - e->last_bytes)<<(7 - idx); + e->last_bytes = b.bytes; e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); e->rate_est->bps = (e->avbps+0xF)>>5; - rate = (npackets - e->last_packets)<<(12 - idx); - e->last_packets = npackets; + rate = (b.packets - e->last_packets)<<(12 - idx); + e->last_packets = b.packets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); e->rate_est->pps = (e->avpps+0x1FF)>>10; skip: @@ -203,12 +204,14 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * */ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { struct gen_estimator *est; struct gnet_estimator *parm = nla_data(opt); + struct gnet_stats_basic_packed b = {0}; int idx; if (nla_len(opt) < sizeof(*parm)) @@ -221,15 +224,18 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (est == NULL) return -ENOBUFS; + __gnet_stats_copy_basic(&b, cpu_bstats, bstats); + idx = parm->interval + 2; est->bstats = bstats; est->rate_est = rate_est; est->stats_lock = stats_lock; est->ewma_log = parm->ewma_log; - est->last_bytes = bstats->bytes; + est->last_bytes = b.bytes; est->avbps = rate_est->bps<<5; - est->last_packets = bstats->packets; + est->last_packets = b.packets; est->avpps = rate_est->pps<<10; + est->cpu_bstats = cpu_bstats; spin_lock_bh(&est_tree_lock); if (!elist[idx].timer.function) { @@ -290,11 +296,12 @@ EXPORT_SYMBOL(gen_kill_estimator); * Returns 0 on success or a negative error code. */ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { gen_kill_estimator(bstats, rate_est); - return gen_new_estimator(bstats, rate_est, stats_lock, opt); + return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt); } EXPORT_SYMBOL(gen_replace_estimator); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 2ddbce4cce14..5ff8e80fe0bb 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -97,6 +97,43 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, } EXPORT_SYMBOL(gnet_stats_start_copy); +static void +__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu) +{ + int i; + + for_each_possible_cpu(i) { + struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + __u64 bytes; + __u32 packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); + bytes = bcpu->bstats.bytes; + packets = bcpu->bstats.packets; + } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + + bstats->bytes += bcpu->bstats.bytes; + bstats->packets += bcpu->bstats.packets; + } +} + +void +__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +{ + if (cpu) { + __gnet_stats_copy_basic_cpu(bstats, cpu); + } else { + bstats->bytes = b->bytes; + bstats->packets = b->packets; + } +} +EXPORT_SYMBOL(__gnet_stats_copy_basic); + /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV * @d: dumping handle @@ -109,19 +146,25 @@ EXPORT_SYMBOL(gnet_stats_start_copy); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) { + struct gnet_stats_basic_packed bstats = {0}; + + __gnet_stats_copy_basic(&bstats, cpu, b); + if (d->compat_tc_stats) { - d->tc_stats.bytes = b->bytes; - d->tc_stats.packets = b->packets; + d->tc_stats.bytes = bstats.bytes; + d->tc_stats.packets = bstats.packets; } if (d->tail) { struct gnet_stats_basic sb; memset(&sb, 0, sizeof(sb)); - sb.bytes = b->bytes; - sb.packets = b->packets; + sb.bytes = bstats.bytes; + sb.packets = bstats.packets; return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); } return 0; diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 370adf622cef..604df6fae6fc 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -136,7 +136,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; - ret = gen_new_estimator(&est->bstats, &est->rstats, + ret = gen_new_estimator(&est->bstats, NULL, &est->rstats, &est->lock, &cfg.opt); if (ret < 0) goto err2; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 648778aef1a2..eca4cf9ece2f 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -252,7 +252,8 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { - int err = gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est, + int err = gen_new_estimator(&p->tcfc_bstats, NULL, + &p->tcfc_rate_est, &p->tcfc_lock, est); if (err) { kfree(p); @@ -619,7 +620,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (err < 0) goto errout; - if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 || + if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index f32bcb094915..69791ca77a05 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -178,7 +178,7 @@ override: spin_lock_bh(&police->tcf_lock); if (est) { - err = gen_replace_estimator(&police->tcf_bstats, + err = gen_replace_estimator(&police->tcf_bstats, NULL, &police->tcf_rate_est, &police->tcf_lock, est); if (err) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 15e7beee266c..a95e3b48fa51 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -942,6 +942,13 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, sch->handle = handle; if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { + if (qdisc_is_percpu_stats(sch)) { + sch->cpu_bstats = + alloc_percpu(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto err_out4; + } + if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB]); if (IS_ERR(stab)) { @@ -964,8 +971,11 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, else root_lock = qdisc_lock(sch); - err = gen_new_estimator(&sch->bstats, &sch->rate_est, - root_lock, tca[TCA_RATE]); + err = gen_new_estimator(&sch->bstats, + sch->cpu_bstats, + &sch->rate_est, + root_lock, + tca[TCA_RATE]); if (err) goto err_out4; } @@ -984,6 +994,7 @@ err_out: return NULL; err_out4: + free_percpu(sch->cpu_bstats); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. @@ -1022,9 +1033,11 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) because change can't be undone. */ if (sch->flags & TCQ_F_MQROOT) goto out; - gen_replace_estimator(&sch->bstats, &sch->rate_est, - qdisc_root_sleeping_lock(sch), - tca[TCA_RATE]); + gen_replace_estimator(&sch->bstats, + sch->cpu_bstats, + &sch->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); } out: return 0; @@ -1299,6 +1312,7 @@ graft: static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { + struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); @@ -1334,7 +1348,10 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) goto nla_put_failure; - if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || + if (qdisc_is_percpu_stats(q)) + cpu_bstats = q->cpu_bstats; + + if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 || gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, &q->qstats) < 0) goto nla_put_failure; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index c398f9c3dbdd..01017663e5d8 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -639,7 +639,7 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, flow->qstats.qlen = flow->q->q.qlen; - if (gnet_stats_copy_basic(d, &flow->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 || gnet_stats_copy_queue(d, &flow->qstats) < 0) return -1; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index d2cd981ba60d..22a3a029a911 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1601,7 +1601,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (cl->undertime != PSCHED_PASTPERFECT) cl->xstats.undertime = cl->undertime - q->now; - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; @@ -1759,7 +1759,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t } if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { @@ -1852,7 +1853,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t goto failure; if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index d8b5ccfd248a..7a6243c5d270 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -88,7 +88,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl != NULL) { if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) @@ -116,7 +117,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->qdisc = &noop_qdisc; if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { @@ -282,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; } - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) return -1; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 11b28f651ad1..7c8e5d73d433 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -632,6 +632,9 @@ static void qdisc_rcu_free(struct rcu_head *head) { struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); + if (qdisc_is_percpu_stats(qdisc)) + free_percpu(qdisc->cpu_bstats); + kfree((char *) qdisc - qdisc->padded); } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 04b0de4c68b5..209b966b2eed 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1014,9 +1014,12 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cur_time = psched_get_time(); if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, - qdisc_root_sleeping_lock(sch), - tca[TCA_RATE]); + spinlock_t *lock = qdisc_root_sleeping_lock(sch); + + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + lock, + tca[TCA_RATE]); if (err) return err; } @@ -1063,7 +1066,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -ENOBUFS; if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { @@ -1374,7 +1377,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 063e953d9848..0256dee69bd6 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1144,7 +1144,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; @@ -1402,7 +1402,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, goto failure; if (htb_rate_est || tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE] ? : &est.nla); if (err) { @@ -1464,8 +1465,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, parent->children++; } else { if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + spinlock_t *lock = qdisc_root_sleeping_lock(sch); + + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + lock, tca[TCA_RATE]); if (err) return err; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index a8b2864a696b..d3a27fb607af 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -201,7 +201,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, sch = dev_queue->qdisc_sleeping; sch->qstats.qlen = sch->q.qlen; - if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || gnet_stats_copy_queue(d, &sch->qstats) < 0) return -1; return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 37e7d25d21f1..8917372fddc6 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -355,7 +355,7 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, } /* Reclaim root sleeping lock before completing stats */ spin_lock_bh(d->lock); - if (gnet_stats_copy_basic(d, &bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 || gnet_stats_copy_queue(d, &qstats) < 0) return -1; } else { @@ -363,7 +363,7 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, sch = dev_queue->qdisc_sleeping; sch->qstats.qlen = sch->q.qlen; - if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || gnet_stats_copy_queue(d, &sch->qstats) < 0) return -1; } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index c0466c1840f3..4adbf7fefc09 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -361,7 +361,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; cl_q->qstats.qlen = cl_q->q.qlen; - if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, &cl_q->qstats) < 0) return -1; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 03ef99e52a5c..68a8f25e30c3 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -325,7 +325,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; cl_q->qstats.qlen = cl_q->q.qlen; - if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, &cl_q->qstats) < 0) return -1; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 602ea01a4ddd..d59f8574540a 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -459,7 +459,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl != NULL) { /* modify existing class */ if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) @@ -484,7 +485,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->qdisc = &noop_qdisc; if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) @@ -667,7 +669,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) return -1; -- cgit v1.2.3 From 25331d6ce42bcf4b34b6705fce4da15c3fabe62f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 28 Sep 2014 11:53:29 -0700 Subject: net: sched: implement qstat helper routines This adds helpers to manipulate qstats logic and replaces locations that touch the counters directly. This simplifies future patches to push qstats onto per cpu counters. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 39 +++++++++++++++++++++++++++++++++------ net/sched/sch_api.c | 2 +- net/sched/sch_atm.c | 2 +- net/sched/sch_cbq.c | 10 +++++----- net/sched/sch_choke.c | 14 +++++++------- net/sched/sch_codel.c | 2 +- net/sched/sch_drr.c | 4 ++-- net/sched/sch_dsmark.c | 2 +- net/sched/sch_fifo.c | 2 +- net/sched/sch_fq.c | 4 ++-- net/sched/sch_fq_codel.c | 8 ++++---- net/sched/sch_gred.c | 4 ++-- net/sched/sch_hfsc.c | 8 ++++---- net/sched/sch_hhf.c | 8 ++++---- net/sched/sch_htb.c | 6 +++--- net/sched/sch_ingress.c | 2 +- net/sched/sch_multiq.c | 4 ++-- net/sched/sch_netem.c | 15 +++++++-------- net/sched/sch_pie.c | 2 +- net/sched/sch_prio.c | 4 ++-- net/sched/sch_qfq.c | 4 ++-- net/sched/sch_red.c | 8 ++++---- net/sched/sch_sfb.c | 10 +++++----- net/sched/sch_sfq.c | 17 +++++++++-------- net/sched/sch_tbf.c | 8 ++++---- 25 files changed, 108 insertions(+), 81 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 4b9351120fd8..23a0f0fc83d8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -521,11 +521,38 @@ static inline void qdisc_bstats_update(struct Qdisc *sch, bstats_update(&sch->bstats, skb); } +static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, + const struct sk_buff *skb) +{ + sch->qstats.backlog -= qdisc_pkt_len(skb); +} + +static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, + const struct sk_buff *skb) +{ + sch->qstats.backlog += qdisc_pkt_len(skb); +} + +static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) +{ + sch->qstats.drops += count; +} + +static inline void qdisc_qstats_drop(struct Qdisc *sch) +{ + sch->qstats.drops++; +} + +static inline void qdisc_qstats_overlimit(struct Qdisc *sch) +{ + sch->qstats.overlimits++; +} + static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff_head *list) { __skb_queue_tail(list, skb); - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; } @@ -541,7 +568,7 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch, struct sk_buff *skb = __skb_dequeue(list); if (likely(skb != NULL)) { - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); } @@ -560,7 +587,7 @@ static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); - sch->qstats.backlog -= len; + qdisc_qstats_backlog_dec(sch, skb); kfree_skb(skb); return len; } @@ -579,7 +606,7 @@ static inline struct sk_buff *__qdisc_dequeue_tail(struct Qdisc *sch, struct sk_buff *skb = __skb_dequeue_tail(list); if (likely(skb != NULL)) - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); return skb; } @@ -661,14 +688,14 @@ static inline unsigned int qdisc_queue_drop(struct Qdisc *sch) static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) { kfree_skb(skb); - sch->qstats.drops++; + qdisc_qstats_drop(sch); return NET_XMIT_DROP; } static inline int qdisc_reshape_fail(struct sk_buff *skb, struct Qdisc *sch) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); #ifdef CONFIG_NET_CLS_ACT if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch)) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a95e3b48fa51..2862bc61a358 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -763,7 +763,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) cops->put(sch, cl); } sch->q.qlen -= n; - sch->qstats.drops += drops; + __qdisc_qstats_drop(sch, drops); } } EXPORT_SYMBOL(qdisc_tree_decrease_qlen); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 01017663e5d8..040212cab988 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -417,7 +417,7 @@ done: if (ret != NET_XMIT_SUCCESS) { drop: __maybe_unused if (net_xmit_drop_count(ret)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); if (flow) flow->qstats.drops++; } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 22a3a029a911..60432c3d3cd4 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -377,7 +377,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) #endif if (cl == NULL) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -395,7 +395,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) } if (net_xmit_drop_count(ret)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); cbq_mark_toplevel(q, cl); cl->qstats.drops++; } @@ -650,11 +650,11 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) return 0; } if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return 0; } - sch->qstats.drops++; + qdisc_qstats_drop(sch); return -1; } #endif @@ -995,7 +995,7 @@ cbq_dequeue(struct Qdisc *sch) */ if (sch->q.qlen) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (q->wd_expires) qdisc_watchdog_schedule(&q->watchdog, now + q->wd_expires); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 8abc2625c3a1..c009eb9045ce 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -127,7 +127,7 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) if (idx == q->tail) choke_zap_tail_holes(q); - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); qdisc_tree_decrease_qlen(sch, 1); --sch->q.qlen; @@ -302,7 +302,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (q->vars.qavg > p->qth_max) { q->vars.qcount = -1; - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (use_harddrop(q) || !use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; @@ -315,7 +315,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->vars.qcount = 0; q->vars.qR = red_random(p); - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; @@ -332,7 +332,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->tab[q->tail] = skb; q->tail = (q->tail + 1) & q->tab_mask; ++sch->q.qlen; - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; } @@ -345,7 +345,7 @@ congestion_drop: other_drop: if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -365,7 +365,7 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch) q->tab[q->head] = NULL; choke_zap_head_holes(q); --sch->q.qlen; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); return skb; @@ -460,7 +460,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) ntab[tail++] = skb; continue; } - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); --sch->q.qlen; qdisc_drop(skb, sch); } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 2f9ab17db85a..de28f8e968e8 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -149,7 +149,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 7a6243c5d270..907b12fd6825 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -360,7 +360,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl = drr_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return err; } @@ -369,7 +369,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return err; } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 485e456c8139..227114f27f94 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -258,7 +258,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) err = qdisc_enqueue(skb, p->q); if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return err; } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index e15a9eb29087..2e2398cfc694 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -42,7 +42,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* queue full, remove one skb to fulfill the limit */ __qdisc_queue_drop_head(sch, &sch->q); - sch->qstats.drops++; + qdisc_qstats_drop(sch); qdisc_enqueue_tail(skb, sch); return NET_XMIT_CN; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index e12f997e1b4c..c9b9fcb53206 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -290,7 +290,7 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) flow->head = skb->next; skb->next = NULL; flow->qlen--; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } return skb; @@ -371,7 +371,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) f->qlen++; if (skb_is_retransmit(skb)) q->stat_tcp_retrans++; - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); if (fq_flow_is_detached(f)) { fq_flow_add_tail(&q->new_flows, f); if (time_after(jiffies, f->age + q->flow_refill_delay)) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 105cf5557630..9270e1b2f25d 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -164,8 +164,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch) q->backlogs[idx] -= len; kfree_skb(skb); sch->q.qlen--; - sch->qstats.drops++; - sch->qstats.backlog -= len; + qdisc_qstats_drop(sch); + qdisc_qstats_backlog_dec(sch, skb); flow->dropped++; return idx; } @@ -180,7 +180,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) idx = fq_codel_classify(skb, sch, &ret); if (idx == 0) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -190,7 +190,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) flow = &q->flows[idx]; flow_queue_add(flow, skb); q->backlogs[idx] += qdisc_pkt_len(skb); - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&flow->flowchain)) { list_add_tail(&flow->flowchain, &q->new_flows); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 12cbc09157fc..a4ca4517cdc8 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -209,7 +209,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_PROB_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; @@ -219,7 +219,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_HARD_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (gred_use_harddrop(t) || !gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 209b966b2eed..ad278251d811 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1591,7 +1591,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl = hfsc_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return err; } @@ -1600,7 +1600,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return err; } @@ -1643,7 +1643,7 @@ hfsc_dequeue(struct Qdisc *sch) */ cl = vttree_get_minvt(&q->root, cur_time); if (cl == NULL) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); hfsc_schedule_watchdog(sch); return NULL; } @@ -1698,7 +1698,7 @@ hfsc_drop(struct Qdisc *sch) list_move_tail(&cl->dlist, &q->droplist); } cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); sch->q.qlen--; return len; } diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index d85b6812a7d4..15d3aabfe250 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -376,8 +376,8 @@ static unsigned int hhf_drop(struct Qdisc *sch) struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; - sch->qstats.drops++; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_drop(sch); + qdisc_qstats_backlog_dec(sch, skb); kfree_skb(skb); } @@ -395,7 +395,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) bucket = &q->buckets[idx]; bucket_add(bucket, skb); - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&bucket->bucketchain)) { unsigned int weight; @@ -457,7 +457,7 @@ begin: if (bucket->head) { skb = dequeue_head(bucket); sch->q.qlen--; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); } if (!skb) { diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 0256dee69bd6..c40ab7a98c50 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -586,13 +586,13 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) #ifdef CONFIG_NET_CLS_ACT } else if (!cl) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; #endif } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); cl->qstats.drops++; } return ret; @@ -925,7 +925,7 @@ ok: goto ok; } } - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (likely(next_event > q->now)) { if (!test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(q->watchdog.qdisc)->state)) { diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index b351125f3849..eb5b8445fef9 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -69,7 +69,7 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) switch (result) { case TC_ACT_SHOT: result = TC_ACT_SHOT; - sch->qstats.drops++; + qdisc_qstats_drop(sch); break; case TC_ACT_STOLEN: case TC_ACT_QUEUED: diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 4adbf7fefc09..7f4e1d8504b0 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -75,7 +75,7 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -87,7 +87,7 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return ret; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 111d70fddaea..b34331967e02 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -429,12 +429,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* Drop packet? */ if (loss_event(q)) { if (q->ecn && INET_ECN_set_ce(skb)) - sch->qstats.drops++; /* mark packet */ + qdisc_qstats_drop(sch); /* mark packet */ else --count; } if (count == 0) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } @@ -478,7 +478,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) return qdisc_reshape_fail(skb, sch); - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); cb = netem_skb_cb(skb); if (q->gap == 0 || /* not doing reordering */ @@ -549,15 +549,14 @@ static unsigned int netem_drop(struct Qdisc *sch) sch->q.qlen--; skb->next = NULL; skb->prev = NULL; - len = qdisc_pkt_len(skb); - sch->qstats.backlog -= len; + qdisc_qstats_backlog_dec(sch, skb); kfree_skb(skb); } } if (!len && q->qdisc && q->qdisc->ops->drop) len = q->qdisc->ops->drop(q->qdisc); if (len) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return len; } @@ -575,7 +574,7 @@ tfifo_dequeue: skb = __skb_dequeue(&sch->q); if (skb) { deliver: - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); return skb; @@ -610,7 +609,7 @@ deliver: if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); qdisc_tree_decrease_qlen(sch, 1); } } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index fefeeb73f15f..33d7a98a7a97 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -232,7 +232,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 68a8f25e30c3..b411e78a02fc 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -77,7 +77,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -89,7 +89,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return ret; } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index d59f8574540a..3fb26555c79b 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1229,7 +1229,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl = qfq_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return err; } @@ -1249,7 +1249,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) pr_debug("qfq_enqueue: enqueue failed %d\n", err); if (net_xmit_drop_count(err)) { cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return err; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 633e32defdcc..6c0534cc7758 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -74,7 +74,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_PROB_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; @@ -84,7 +84,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_HARD_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (red_use_harddrop(q) || !red_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; @@ -100,7 +100,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) sch->q.qlen++; } else if (net_xmit_drop_count(ret)) { q->stats.pdrop++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return ret; @@ -142,7 +142,7 @@ static unsigned int red_drop(struct Qdisc *sch) if (child->ops->drop && (len = child->ops->drop(child)) > 0) { q->stats.other++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); sch->q.qlen--; return len; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 1562fb2b3f46..5819dd82630d 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -290,7 +290,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct flow_keys keys; if (unlikely(sch->q.qlen >= q->limit)) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); q->stats.queuedrop++; goto drop; } @@ -348,7 +348,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) sfb_skb_cb(skb)->hashes[slot] = 0; if (unlikely(minqlen >= q->max)) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); q->stats.bucketdrop++; goto drop; } @@ -376,7 +376,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } } if (sfb_rate_limit(skb, q)) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); q->stats.penaltydrop++; goto drop; } @@ -411,7 +411,7 @@ enqueue: increment_qlen(skb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return ret; @@ -420,7 +420,7 @@ drop: return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 80c36bd54abc..158dfa641d18 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -331,8 +331,8 @@ drop: sfq_dec(q, x); kfree_skb(skb); sch->q.qlen--; - sch->qstats.drops++; - sch->qstats.backlog -= len; + qdisc_qstats_drop(sch); + qdisc_qstats_backlog_dec(sch, skb); return len; } @@ -379,7 +379,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) hash = sfq_classify(skb, sch, &ret); if (hash == 0) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -409,7 +409,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_PROB_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (sfq_prob_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && @@ -426,7 +426,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) goto congestion_drop; case RED_HARD_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (sfq_hard_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && @@ -461,7 +461,7 @@ congestion_drop: } enqueue: - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); slot->backlog += qdisc_pkt_len(skb); slot_queue_add(slot, skb); sfq_inc(q, x); @@ -520,7 +520,7 @@ next_slot: sfq_dec(q, a); qdisc_bstats_update(sch, skb); sch->q.qlen--; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); slot->backlog -= qdisc_pkt_len(skb); /* Is the slot empty? */ if (slot->qlen == 0) { @@ -586,7 +586,8 @@ static void sfq_rehash(struct Qdisc *sch) if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) { -drop: sch->qstats.backlog -= qdisc_pkt_len(skb); +drop: + qdisc_qstats_backlog_dec(sch, skb); kfree_skb(skb); dropped++; continue; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 0c39b754083b..77edffe329c4 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -175,7 +175,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(segs, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); } else { nb++; } @@ -201,7 +201,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return ret; } @@ -216,7 +216,7 @@ static unsigned int tbf_drop(struct Qdisc *sch) if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { sch->q.qlen--; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return len; } @@ -281,7 +281,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) (cf. CSZ, HPFQ, HFSC) */ - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); } return NULL; } -- cgit v1.2.3 From 6401585366326fc0ecbc372ec60d1a15cd8be2f5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 28 Sep 2014 11:53:57 -0700 Subject: net: sched: restrict use of qstats qlen This removes the use of qstats->qlen variable from the classifiers and makes it an explicit argument to gnet_stats_copy_queue(). The qlen represents the qdisc queue length and is packed into the qstats at the last moment before passnig to user space. By handling it explicitely we avoid, in the percpu stats case, having to figure out which per_cpu variable to put it in. It would probably be best to remove it from qstats completely but qstats is a user space ABI and can't be broken. A future patch could make an internal only qstats structure that would avoid having to allocate an additional u32 variable on the Qdisc struct. This would make the qstats struct 128bits instead of 128+32. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/gen_stats.h | 3 ++- net/core/gen_stats.c | 6 +++++- net/sched/act_api.c | 4 +++- net/sched/sch_api.c | 5 +++-- net/sched/sch_atm.c | 4 +--- net/sched/sch_cbq.c | 3 +-- net/sched/sch_drr.c | 7 +++---- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_hfsc.c | 3 +-- net/sched/sch_htb.c | 5 +++-- net/sched/sch_mq.c | 4 +--- net/sched/sch_mqprio.c | 9 ++++----- net/sched/sch_multiq.c | 3 +-- net/sched/sch_prio.c | 3 +-- net/sched/sch_qfq.c | 3 +-- net/sched/sch_sfq.c | 2 +- 16 files changed, 32 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index ce3c1281f2a0..de9b3dd5750e 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -40,7 +40,8 @@ void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, struct gnet_stats_rate_est64 *r); -int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q); +int gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue *q, __u32 len); int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 5ff8e80fe0bb..ad3ecb6ba835 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -219,6 +219,7 @@ EXPORT_SYMBOL(gnet_stats_copy_rate_est); * gnet_stats_copy_queue - copy queue statistics into statistics TLV * @d: dumping handle * @q: queue statistics + * @qlen: queue length statistics * * Appends the queue statistics to the top level TLV created by * gnet_stats_start_copy(). @@ -227,8 +228,11 @@ EXPORT_SYMBOL(gnet_stats_copy_rate_est); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) +gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue *q, __u32 qlen) { + q->qlen = qlen; + if (d->compat_tc_stats) { d->tc_stats.drops = q->drops; d->tc_stats.qlen = q->qlen; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index eca4cf9ece2f..2e134093b8ec 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -623,7 +623,9 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || - gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0) + gnet_stats_copy_queue(&d, + &p->tcfc_qstats, + p->tcfc_qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 2862bc61a358..ca00ea8e84dc 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1318,6 +1318,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; struct qdisc_size_table *stab; + __u32 qlen; cond_resched(); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); @@ -1335,7 +1336,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; - q->qstats.qlen = q->q.qlen; + qlen = q->q.qlen; stab = rtnl_dereference(q->stab); if (stab && qdisc_dump_stab(skb, stab) < 0) @@ -1353,7 +1354,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 || gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || - gnet_stats_copy_queue(&d, &q->qstats) < 0) + gnet_stats_copy_queue(&d, &q->qstats, qlen) < 0) goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 040212cab988..c145eb6279cc 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -637,10 +637,8 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, { struct atm_flow_data *flow = (struct atm_flow_data *)arg; - flow->qstats.qlen = flow->q->q.qlen; - if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 || - gnet_stats_copy_queue(d, &flow->qstats) < 0) + gnet_stats_copy_queue(d, &flow->qstats, flow->q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 60432c3d3cd4..c610081ffba5 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1594,7 +1594,6 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; - cl->qstats.qlen = cl->q->q.qlen; cl->xstats.avgidle = cl->avgidle; cl->xstats.undertime = 0; @@ -1603,7 +1602,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats) < 0) + gnet_stats_copy_queue(d, &cl->qstats, cl->q->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 907b12fd6825..5835a93905b1 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -275,17 +275,16 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct drr_class *cl = (struct drr_class *)arg; + __u32 qlen = cl->qdisc->q.qlen; struct tc_drr_stats xstats; memset(&xstats, 0, sizeof(xstats)); - if (cl->qdisc->q.qlen) { + if (qlen) xstats.deficit = cl->deficit; - cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; - } if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + gnet_stats_copy_queue(d, &cl->qdisc->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 9270e1b2f25d..226d73597539 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -550,7 +550,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.backlog = q->backlogs[idx]; qs.drops = flow->dropped; } - if (gnet_stats_copy_queue(d, &qs) < 0) + if (gnet_stats_copy_queue(d, &qs, 0) < 0) return -1; if (idx < q->flows_cnt) return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index ad278251d811..d364acb2ad07 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1370,7 +1370,6 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct hfsc_class *cl = (struct hfsc_class *)arg; struct tc_hfsc_stats xstats; - cl->qstats.qlen = cl->qdisc->q.qlen; cl->qstats.backlog = cl->qdisc->qstats.backlog; xstats.level = cl->level; xstats.period = cl->cl_vtperiod; @@ -1379,7 +1378,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats) < 0) + gnet_stats_copy_queue(d, &cl->qstats, cl->qdisc->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index c40ab7a98c50..3a691fd88f99 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1138,15 +1138,16 @@ static int htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct htb_class *cl = (struct htb_class *)arg; + __u32 qlen = 0; if (!cl->level && cl->un.leaf.q) - cl->qstats.qlen = cl->un.leaf.q->q.qlen; + qlen = cl->un.leaf.q->q.qlen; cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats) < 0) + gnet_stats_copy_queue(d, &cl->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index d3a27fb607af..6416a6942062 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -112,7 +112,6 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.qlen += qdisc->qstats.qlen; sch->qstats.backlog += qdisc->qstats.backlog; sch->qstats.drops += qdisc->qstats.drops; sch->qstats.requeues += qdisc->qstats.requeues; @@ -200,9 +199,8 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - sch->qstats.qlen = sch->q.qlen; if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, &sch->qstats) < 0) + gnet_stats_copy_queue(d, &sch->qstats, sch->q.qlen) < 0) return -1; return 0; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 8917372fddc6..03dbeb5e8181 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -236,7 +236,6 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.qlen += qdisc->qstats.qlen; sch->qstats.backlog += qdisc->qstats.backlog; sch->qstats.drops += qdisc->qstats.drops; sch->qstats.requeues += qdisc->qstats.requeues; @@ -327,6 +326,7 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (cl <= netdev_get_num_tc(dev)) { int i; + __u32 qlen = 0; struct Qdisc *qdisc; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_packed bstats = {0}; @@ -344,9 +344,9 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); + qlen += qdisc->q.qlen; bstats.bytes += qdisc->bstats.bytes; bstats.packets += qdisc->bstats.packets; - qstats.qlen += qdisc->qstats.qlen; qstats.backlog += qdisc->qstats.backlog; qstats.drops += qdisc->qstats.drops; qstats.requeues += qdisc->qstats.requeues; @@ -356,15 +356,14 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, /* Reclaim root sleeping lock before completing stats */ spin_lock_bh(d->lock); if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 || - gnet_stats_copy_queue(d, &qstats) < 0) + gnet_stats_copy_queue(d, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - sch->qstats.qlen = sch->q.qlen; if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, &sch->qstats) < 0) + gnet_stats_copy_queue(d, &sch->qstats, sch->q.qlen) < 0) return -1; } return 0; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 7f4e1d8504b0..53357b368bff 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -360,9 +360,8 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - cl_q->qstats.qlen = cl_q->q.qlen; if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || - gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + gnet_stats_copy_queue(d, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index b411e78a02fc..4644f55242d2 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -324,9 +324,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - cl_q->qstats.qlen = cl_q->q.qlen; if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || - gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + gnet_stats_copy_queue(d, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 3fb26555c79b..66df9d9e301a 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -664,14 +664,13 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct tc_qfq_stats xstats; memset(&xstats, 0, sizeof(xstats)); - cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + gnet_stats_copy_queue(d, &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 158dfa641d18..d4afcbc1c6f7 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -871,7 +871,7 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.qlen = slot->qlen; qs.backlog = slot->backlog; } - if (gnet_stats_copy_queue(d, &qs) < 0) + if (gnet_stats_copy_queue(d, &qs, qs.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } -- cgit v1.2.3 From b0ab6f92752b9f9d8da980506e9df3bd9dcd7ed3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 28 Sep 2014 11:54:24 -0700 Subject: net: sched: enable per cpu qstats After previous patches to simplify qstats the qstats can be made per cpu with a packed union in Qdisc struct. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/gen_stats.h | 3 ++- include/net/sch_generic.h | 12 ++++++++++- net/core/gen_stats.c | 55 +++++++++++++++++++++++++++++++++++++++++------ net/sched/act_api.c | 2 +- net/sched/sch_api.c | 12 +++++++++-- net/sched/sch_atm.c | 2 +- net/sched/sch_cbq.c | 2 +- net/sched/sch_drr.c | 2 +- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_hfsc.c | 2 +- net/sched/sch_htb.c | 2 +- net/sched/sch_mq.c | 2 +- net/sched/sch_mqprio.c | 5 +++-- net/sched/sch_multiq.c | 2 +- net/sched/sch_prio.c | 2 +- net/sched/sch_qfq.c | 3 ++- net/sched/sch_sfq.c | 2 +- 17 files changed, 87 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index de9b3dd5750e..cbafa3768d48 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -41,7 +41,8 @@ int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, struct gnet_stats_rate_est64 *r); int gnet_stats_copy_queue(struct gnet_dump *d, - struct gnet_stats_queue *q, __u32 len); + struct gnet_stats_queue __percpu *cpu_q, + struct gnet_stats_queue *q, __u32 qlen); int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 23a0f0fc83d8..f12669819d1a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -90,7 +90,10 @@ struct Qdisc { struct gnet_stats_basic_cpu __percpu *cpu_bstats; } __packed; unsigned int __state; - struct gnet_stats_queue qstats; + union { + struct gnet_stats_queue qstats; + struct gnet_stats_queue __percpu *cpu_qstats; + } __packed; struct rcu_head rcu_head; int padded; atomic_t refcnt; @@ -543,6 +546,13 @@ static inline void qdisc_qstats_drop(struct Qdisc *sch) sch->qstats.drops++; } +static inline void qdisc_qstats_drop_cpu(struct Qdisc *sch) +{ + struct gnet_stats_queue *qstats = this_cpu_ptr(sch->cpu_qstats); + + qstats->drops++; +} + static inline void qdisc_qstats_overlimit(struct Qdisc *sch) { sch->qstats.overlimits++; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index ad3ecb6ba835..14681b97a4f3 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -215,33 +215,74 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, } EXPORT_SYMBOL(gnet_stats_copy_rate_est); +static void +__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *q) +{ + int i; + + for_each_possible_cpu(i) { + const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); + + qstats->qlen = 0; + qstats->backlog += qcpu->backlog; + qstats->drops += qcpu->drops; + qstats->requeues += qcpu->requeues; + qstats->overlimits += qcpu->overlimits; + } +} + +static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q, + __u32 qlen) +{ + if (cpu) { + __gnet_stats_copy_queue_cpu(qstats, cpu); + } else { + qstats->qlen = q->qlen; + qstats->backlog = q->backlog; + qstats->drops = q->drops; + qstats->requeues = q->requeues; + qstats->overlimits = q->overlimits; + } + + qstats->qlen = qlen; +} + /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV * @d: dumping handle + * @cpu_q: per cpu queue statistics * @q: queue statistics * @qlen: queue length statistics * * Appends the queue statistics to the top level TLV created by - * gnet_stats_start_copy(). + * gnet_stats_start_copy(). Using per cpu queue statistics if + * they are available. * * Returns 0 on success or -1 with the statistic lock released * if the room in the socket buffer was not sufficient. */ int gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen) { - q->qlen = qlen; + struct gnet_stats_queue qstats = {0}; + + __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen); if (d->compat_tc_stats) { - d->tc_stats.drops = q->drops; - d->tc_stats.qlen = q->qlen; - d->tc_stats.backlog = q->backlog; - d->tc_stats.overlimits = q->overlimits; + d->tc_stats.drops = qstats.drops; + d->tc_stats.qlen = qstats.qlen; + d->tc_stats.backlog = qstats.backlog; + d->tc_stats.overlimits = qstats.overlimits; } if (d->tail) - return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q)); + return gnet_stats_copy(d, TCA_STATS_QUEUE, + &qstats, sizeof(qstats)); return 0; } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2e134093b8ec..3d43e4979f27 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -623,7 +623,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || - gnet_stats_copy_queue(&d, + gnet_stats_copy_queue(&d, NULL, &p->tcfc_qstats, p->tcfc_qstats.qlen) < 0) goto errout; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ca00ea8e84dc..aa8329508dba 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -947,6 +947,10 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, alloc_percpu(struct gnet_stats_basic_cpu); if (!sch->cpu_bstats) goto err_out4; + + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) + goto err_out4; } if (tca[TCA_STAB]) { @@ -995,6 +999,7 @@ err_out: err_out4: free_percpu(sch->cpu_bstats); + free_percpu(sch->cpu_qstats); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. @@ -1313,6 +1318,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); @@ -1349,12 +1355,14 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) goto nla_put_failure; - if (qdisc_is_percpu_stats(q)) + if (qdisc_is_percpu_stats(q)) { cpu_bstats = q->cpu_bstats; + cpu_qstats = q->cpu_qstats; + } if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 || gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || - gnet_stats_copy_queue(&d, &q->qstats, qlen) < 0) + gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index c145eb6279cc..e3e2cc5fd068 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -638,7 +638,7 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct atm_flow_data *flow = (struct atm_flow_data *)arg; if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 || - gnet_stats_copy_queue(d, &flow->qstats, flow->q->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index c610081ffba5..beeb75f80fdb 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1602,7 +1602,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats, cl->q->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 5835a93905b1..338706092c27 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -284,7 +284,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qdisc->qstats, qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 226d73597539..b9ca32ebc1de 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -550,7 +550,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.backlog = q->backlogs[idx]; qs.drops = flow->dropped; } - if (gnet_stats_copy_queue(d, &qs, 0) < 0) + if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0) return -1; if (idx < q->flows_cnt) return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index d364acb2ad07..e6c7416d0332 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1378,7 +1378,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats, cl->qdisc->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 3a691fd88f99..f1acb0f60dc3 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1147,7 +1147,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats, qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 6416a6942062..f3cbaecd283a 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -200,7 +200,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, sch = dev_queue->qdisc_sleeping; if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, &sch->qstats, sch->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; return 0; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 03dbeb5e8181..3811a745452c 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -356,14 +356,15 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, /* Reclaim root sleeping lock before completing stats */ spin_lock_bh(d->lock); if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 || - gnet_stats_copy_queue(d, &qstats, qlen) < 0) + gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, &sch->qstats, sch->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, + &sch->qstats, sch->q.qlen) < 0) return -1; } return 0; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 53357b368bff..42dd218871e0 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -361,7 +361,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || - gnet_stats_copy_queue(d, &cl_q->qstats, cl_q->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 4644f55242d2..8e5cd34aaa74 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -325,7 +325,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || - gnet_stats_copy_queue(d, &cl_q->qstats, cl_q->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 66df9d9e301a..3ec7e88a43ca 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -670,7 +670,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, + &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index d4afcbc1c6f7..b877140beda5 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -871,7 +871,7 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.qlen = slot->qlen; qs.backlog = slot->backlog; } - if (gnet_stats_copy_queue(d, &qs, qs.qlen) < 0) + if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } -- cgit v1.2.3 From 57f5877c11b244ff2315f4ba0e57b54fe013581f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 30 Sep 2014 10:59:18 +0200 Subject: netfilter: bridge: build br_nf_core only if required Eric reports build failure with CONFIG_BRIDGE_NETFILTER=n We insist to build br_nf_core.o unconditionally, but we must only do so if br_netfilter was enabled, else it fails to build due to functions being defined to empty stubs (and some structure members being defined out). Also, BRIDGE_NETFILTER=y|m makes no sense when BRIDGE=n. Fixes: 34666d467 (netfilter: bridge: move br_netfilter out of the core) Reported-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/Kconfig | 2 +- net/bridge/Makefile | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index dc5d700d05e7..d6b138e2c263 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -177,7 +177,7 @@ config NETFILTER_ADVANCED config BRIDGE_NETFILTER tristate "Bridged IP/ARP packets filtering" - depends on (BRIDGE || BRIDGE=n) + depends on BRIDGE depends on NETFILTER && INET depends on NETFILTER_ADVANCED default m diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 5e3eac5dc8b9..fd7ee03c59b3 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -6,11 +6,12 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_stp.o br_stp_bpdu.o \ - br_stp_if.o br_stp_timer.o br_netlink.o \ - br_nf_core.o + br_stp_if.o br_stp_timer.o br_netlink.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o +bridge-$(subst m,y,$(CONFIG_BRIDGE_NETFILTER)) += br_nf_core.o + obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o -- cgit v1.2.3 From 0d4a2f9a33ea0d9964afca92b9b62e943c00b904 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 30 Sep 2014 22:24:04 +0200 Subject: irda: add __init to irlan_open irlan_open is only called by __init irlan_init in same module. Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/irda/irlan/irlan_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c index 1bc49edf2296..5a2d0a695529 100644 --- a/net/irda/irlan/irlan_common.c +++ b/net/irda/irlan/irlan_common.c @@ -98,7 +98,7 @@ static const struct file_operations irlan_fops = { extern struct proc_dir_entry *proc_irda; #endif /* CONFIG_PROC_FS */ -static struct irlan_cb *irlan_open(__u32 saddr, __u32 daddr); +static struct irlan_cb __init *irlan_open(__u32 saddr, __u32 daddr); static void __irlan_close(struct irlan_cb *self); static int __irlan_insert_param(struct sk_buff *skb, char *param, int type, __u8 value_byte, __u16 value_short, @@ -196,7 +196,7 @@ static void __exit irlan_cleanup(void) * Open new instance of a client/provider, we should only register the * network device if this instance is ment for a particular client/provider */ -static struct irlan_cb *irlan_open(__u32 saddr, __u32 daddr) +static struct irlan_cb __init *irlan_open(__u32 saddr, __u32 daddr) { struct net_device *dev; struct irlan_cb *self; -- cgit v1.2.3 From 3243acd37fd9b7fc4b19318eddf28b418b3b060a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 30 Sep 2014 22:34:08 +0200 Subject: ieee802154: add __init to lowpan_frags_sysctl_register lowpan_frags_sysctl_register is only called by __init lowpan_net_frag_init (part of the lowpan module). Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ieee802154/reassembly.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c index 32755cb7e64e..30ec608b351a 100644 --- a/net/ieee802154/reassembly.c +++ b/net/ieee802154/reassembly.c @@ -485,7 +485,7 @@ static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net) static struct ctl_table_header *lowpan_ctl_header; -static int lowpan_frags_sysctl_register(void) +static int __init lowpan_frags_sysctl_register(void) { lowpan_ctl_header = register_net_sysctl(&init_net, "net/ieee802154/6lowpan", @@ -498,7 +498,7 @@ static void lowpan_frags_sysctl_unregister(void) unregister_net_sysctl_table(lowpan_ctl_header); } #else -static inline int lowpan_frags_ns_sysctl_register(struct net *net) +static inline int __init lowpan_frags_ns_sysctl_register(struct net *net) { return 0; } -- cgit v1.2.3 From a12a601ed163578084a48708ae376805f79a1ccf Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 30 Sep 2014 09:49:55 +0800 Subject: tcp: Change tcp_slow_start function to return void No caller uses the return value, so make this function return void. Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_cong.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 545a79aa4212..ba2f9d03076b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -830,7 +830,7 @@ void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name); -int tcp_slow_start(struct tcp_sock *tp, u32 acked); +void tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); u32 tcp_reno_ssthresh(struct sock *sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index a6c8a5775624..b1c5970d47a1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -291,15 +291,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ -int tcp_slow_start(struct tcp_sock *tp, u32 acked) +void tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = tp->snd_cwnd + acked; if (cwnd > tp->snd_ssthresh) cwnd = tp->snd_ssthresh + 1; - acked -= cwnd - tp->snd_cwnd; tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); - return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); -- cgit v1.2.3 From f0a0c1cedfe421b32d2aa39971c43f83f8e5fa42 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 1 Oct 2014 07:27:46 +0200 Subject: ieee802154: fix __init functions Commit 3243acd37fd9 ("ieee802154: add __init to lowpan_frags_sysctl_register") added __init to lowpan_frags_ns_sysctl_register instead of lowpan_frags_sysctl_register Suggested-by: Alexander Aring Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ieee802154/reassembly.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c index 30ec608b351a..7cfcd6885225 100644 --- a/net/ieee802154/reassembly.c +++ b/net/ieee802154/reassembly.c @@ -498,7 +498,7 @@ static void lowpan_frags_sysctl_unregister(void) unregister_net_sysctl_table(lowpan_ctl_header); } #else -static inline int __init lowpan_frags_ns_sysctl_register(struct net *net) +static inline int lowpan_frags_ns_sysctl_register(struct net *net) { return 0; } @@ -507,7 +507,7 @@ static inline void lowpan_frags_ns_sysctl_unregister(struct net *net) { } -static inline int lowpan_frags_sysctl_register(void) +static inline int __init lowpan_frags_sysctl_register(void) { return 0; } -- cgit v1.2.3 From 2c804d0f8fc7799981d9fdd8c88653541b28c1a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Sep 2014 22:12:05 -0700 Subject: ipv4: mentions skb_gro_postpull_rcsum() in inet_gro_receive() Proper CHECKSUM_COMPLETE support needs to adjust skb->csum when we remove one header. Its done using skb_gro_postpull_rcsum() In the case of IPv4, we know that the adjustment is not really needed, because the checksum over IPv4 header is 0. Lets add a comment to ease code comprehension and avoid copy/paste errors. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 28e589c5f32d..92db7a69f2b9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1369,6 +1369,9 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, * immediately following this IP hdr. */ + /* Note : No need to call skb_gro_postpull_rcsum() here, + * as we already checked checksum over ipv4 header was 0 + */ skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); -- cgit v1.2.3 From e506d405ac7d34d03996c97ac68aa2ac010be64a Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 1 Oct 2014 13:59:00 +0200 Subject: net: dsa: Fix build warning for !PM_SLEEP The dsa_switch_suspend() and dsa_switch_resume() functions are only used when PM_SLEEP is enabled, so they need #ifdef CONFIG_PM_SLEEP protection to avoid a compiler warning. Signed-off-by: Thierry Reding Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6905f2d84c44..22f34cf4cb27 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -238,6 +238,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) { } +#ifdef CONFIG_PM_SLEEP static int dsa_switch_suspend(struct dsa_switch *ds) { int i, ret = 0; @@ -280,6 +281,7 @@ static int dsa_switch_resume(struct dsa_switch *ds) return 0; } +#endif /* link polling *************************************************************/ -- cgit v1.2.3 From 47d7a88c188f06ffaea3a539f84fe10cb4e77787 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 1 Oct 2014 18:27:50 +0200 Subject: tcp: add __init to tcp_init_mem tcp_init_mem is only called by __init tcp_init. Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cf5e508e1ef5..5c170340f684 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3172,7 +3172,7 @@ static int __init set_thash_entries(char *str) } __setup("thash_entries=", set_thash_entries); -static void tcp_init_mem(void) +static void __init tcp_init_mem(void) { unsigned long limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); -- cgit v1.2.3 From 57a02c39c1c20ed03a86f8014c11a8c18b94cac3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 1 Oct 2014 19:18:57 +0200 Subject: inet: frags: add __init to ip4_frags_ctl_register ip4_frags_ctl_register is only called by __init ipfrag_init Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 15f0e2bad7ad..2811cc18701a 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -790,7 +790,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) kfree(table); } -static void ip4_frags_ctl_register(void) +static void __init ip4_frags_ctl_register(void) { register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } @@ -804,7 +804,7 @@ static inline void ip4_frags_ns_ctl_unregister(struct net *net) { } -static inline void ip4_frags_ctl_register(void) +static inline void __init ip4_frags_ctl_register(void) { } #endif -- cgit v1.2.3 From cb57659a15c6c0576493cc8a10474ce7ffd44eb3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 1 Oct 2014 19:30:03 +0200 Subject: cipso: add __init to cipso_v4_cache_init cipso_v4_cache_init is only called by __init cipso_v4_init Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 05b708bbdb0d..4715f25dfe03 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -246,7 +246,7 @@ static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len) * success, negative values on error. * */ -static int cipso_v4_cache_init(void) +static int __init cipso_v4_cache_init(void) { u32 iter; -- cgit v1.2.3 From b248230c34970a6c1c17c591d63b464e8d2cfc33 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Sep 2014 13:20:38 -0700 Subject: tcp: abort orphan sockets stalling on zero window probes Currently we have two different policies for orphan sockets that repeatedly stall on zero window ACKs. If a socket gets a zero window ACK when it is transmitting data, the RTO is used to probe the window. The socket is aborted after roughly tcp_orphan_retries() retries (as in tcp_write_timeout()). But if the socket was idle when it received the zero window ACK, and later wants to send more data, we use the probe timer to probe the window. If the receiver always returns zero window ACKs, icsk_probes keeps getting reset in tcp_ack() and the orphan socket can stall forever until the system reaches the orphan limit (as commented in tcp_probe_timer()). This opens up a simple attack to create lots of hanging orphan sockets to burn the memory and the CPU, as demonstrated in the recent netdev post "TCP connection will hang in FIN_WAIT1 after closing if zero window is advertised." http://www.spinics.net/lists/netdev/msg296539.html This patch follows the design in RTO-based probe: we abort an orphan socket stalling on zero window when the probe timer reaches both the maximum backoff and the maximum RTO. For example, an 100ms RTT connection will timeout after roughly 153 seconds (0.3 + 0.6 + .... + 76.8) if the receiver keeps the window shut. If the orphan socket passes this check, but the system already has too many orphans (as in tcp_out_of_resources()), we still abort it but we'll also send an RST packet as the connection may still be active. In addition, we change TCP_USER_TIMEOUT to cover (life or dead) sockets stalled on zero-window probes. This changes the semantics of TCP_USER_TIMEOUT slightly because it previously only applies when the socket has pending transmission. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Reported-by: Andrey Dmitrov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_timer.c | 41 +++++++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5c170340f684..26a6f113f00c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2693,7 +2693,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; #endif case TCP_USER_TIMEOUT: - /* Cap the max timeout in ms TCP will retry/retrans + /* Cap the max time in ms TCP will retry or probe the window * before giving up and aborting (ETIMEDOUT) a connection. */ if (val < 0) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b24360f6e293..9b21ae8b2e31 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -52,7 +52,7 @@ static void tcp_write_err(struct sock *sk) * limit. * 2. If we have strong memory pressure. */ -static int tcp_out_of_resources(struct sock *sk, int do_reset) +static int tcp_out_of_resources(struct sock *sk, bool do_reset) { struct tcp_sock *tp = tcp_sk(sk); int shift = 0; @@ -72,7 +72,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || /* 2. Window is closed. */ (!tp->snd_wnd && !tp->packets_out)) - do_reset = 1; + do_reset = true; if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); @@ -270,40 +270,41 @@ static void tcp_probe_timer(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; + u32 start_ts; if (tp->packets_out || !tcp_send_head(sk)) { icsk->icsk_probes_out = 0; return; } - /* *WARNING* RFC 1122 forbids this - * - * It doesn't AFAIK, because we kill the retransmit timer -AK - * - * FIXME: We ought not to do it, Solaris 2.5 actually has fixing - * this behaviour in Solaris down as a bug fix. [AC] - * - * Let me to explain. icsk_probes_out is zeroed by incoming ACKs - * even if they advertise zero window. Hence, connection is killed only - * if we received no ACKs for normal connection timeout. It is not killed - * only because window stays zero for some time, window may be zero - * until armageddon and even later. We are in full accordance - * with RFCs, only probe timer combines both retransmission timeout - * and probe timeout in one bottle. --ANK + /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as + * long as the receiver continues to respond probes. We support this by + * default and reset icsk_probes_out with incoming ACKs. But if the + * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we + * kill the socket when the retry count and the time exceeds the + * corresponding system limit. We also implement similar policy when + * we use RTO to probe window in tcp_retransmit_timer(). */ - max_probes = sysctl_tcp_retries2; + start_ts = tcp_skb_timestamp(tcp_send_head(sk)); + if (!start_ts) + skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp); + else if (icsk->icsk_user_timeout && + (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) + goto abort; + max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); - - if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) + if (!alive && icsk->icsk_backoff >= max_probes) + goto abort; + if (tcp_out_of_resources(sk, true)) return; } if (icsk->icsk_probes_out > max_probes) { - tcp_write_err(sk); +abort: tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ tcp_send_probe0(sk); -- cgit v1.2.3 From d0bf4a9e92b9a93ffeeacbd7b6cb83e0ee3dc2ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Sep 2014 13:29:15 -0700 Subject: net: cleanup and document skb fclone layout Lets use a proper structure to clearly document and implement skb fast clones. Then, we might experiment more easily alternative layouts. This patch adds a new skb_fclone_busy() helper, used by tcp and xfrm, to stop leaking of implementation details. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 25 +++++++++++++++++++++++++ net/core/skbuff.c | 41 ++++++++++++++++++++--------------------- net/ipv4/tcp_output.c | 5 +---- net/xfrm/xfrm_policy.c | 4 +--- 4 files changed, 47 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 262efdbc346b..d8f7d74d5a4d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -781,6 +781,31 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, int *errcode, gfp_t gfp_mask); +/* Layout of fast clones : [skb1][skb2][fclone_ref] */ +struct sk_buff_fclones { + struct sk_buff skb1; + + struct sk_buff skb2; + + atomic_t fclone_ref; +}; + +/** + * skb_fclone_busy - check if fclone is busy + * @skb: buffer + * + * Returns true is skb is a fast clone, and its clone is not freed. + */ +static inline bool skb_fclone_busy(const struct sk_buff *skb) +{ + const struct sk_buff_fclones *fclones; + + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + return skb->fclone == SKB_FCLONE_ORIG && + fclones->skb2.fclone == SKB_FCLONE_CLONE; +} + static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4be570a4ab21..a8cebb40699c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -257,15 +257,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { - struct sk_buff *child = skb + 1; - atomic_t *fclone_ref = (atomic_t *) (child + 1); + struct sk_buff_fclones *fclones; - kmemcheck_annotate_bitfield(child, flags1); + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; - atomic_set(fclone_ref, 1); + atomic_set(&fclones->fclone_ref, 1); - child->fclone = SKB_FCLONE_UNAVAILABLE; - child->pfmemalloc = pfmemalloc; + fclones->skb2.fclone = SKB_FCLONE_UNAVAILABLE; + fclones->skb2.pfmemalloc = pfmemalloc; } out: return skb; @@ -524,8 +525,7 @@ static void skb_release_data(struct sk_buff *skb) */ static void kfree_skbmem(struct sk_buff *skb) { - struct sk_buff *other; - atomic_t *fclone_ref; + struct sk_buff_fclones *fclones; switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: @@ -533,22 +533,21 @@ static void kfree_skbmem(struct sk_buff *skb) break; case SKB_FCLONE_ORIG: - fclone_ref = (atomic_t *) (skb + 2); - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, skb); + fclones = container_of(skb, struct sk_buff_fclones, skb1); + if (atomic_dec_and_test(&fclones->fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, fclones); break; case SKB_FCLONE_CLONE: - fclone_ref = (atomic_t *) (skb + 1); - other = skb - 1; + fclones = container_of(skb, struct sk_buff_fclones, skb2); /* The clone portion is available for * fast-cloning again. */ skb->fclone = SKB_FCLONE_UNAVAILABLE; - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, other); + if (atomic_dec_and_test(&fclones->fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, fclones); break; } } @@ -859,17 +858,18 @@ EXPORT_SYMBOL_GPL(skb_copy_ubufs); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { - struct sk_buff *n; + struct sk_buff_fclones *fclones = container_of(skb, + struct sk_buff_fclones, + skb1); + struct sk_buff *n = &fclones->skb2; if (skb_orphan_frags(skb, gfp_mask)) return NULL; - n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && n->fclone == SKB_FCLONE_UNAVAILABLE) { - atomic_t *fclone_ref = (atomic_t *) (n + 1); n->fclone = SKB_FCLONE_CLONE; - atomic_inc(fclone_ref); + atomic_inc(&fclones->fclone_ref); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -3240,8 +3240,7 @@ void __init skb_init(void) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", - (2*sizeof(struct sk_buff)) + - sizeof(atomic_t), + sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ee567e9e98c3..8d4eac793700 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2110,10 +2110,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { - const struct sk_buff *fclone = skb + 1; - - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) { + if (unlikely(skb_fclone_busy(skb))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f623dca6ce30..4c4e457e7888 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1961,10 +1961,8 @@ static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) struct xfrm_dst *xdst = (struct xfrm_dst *) dst; struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; - const struct sk_buff *fclone = skb + 1; - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) { + if (unlikely(skb_fclone_busy(skb))) { kfree_skb(skb); return 0; } -- cgit v1.2.3 From 0c5b8a46294d43fc63788839d3c18de0961ec1bc Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 1 Oct 2014 06:48:03 +0200 Subject: net/dccp/proto.c: add __init to dccp_mib_init dccp_mib_init is only called by __init dccp_init in same module. Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/dccp/proto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dccp/proto.c b/net/dccp/proto.c index de2c1e719305..1ad150ed57cf 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1082,7 +1082,7 @@ void dccp_shutdown(struct sock *sk, int how) EXPORT_SYMBOL_GPL(dccp_shutdown); -static inline int dccp_mib_init(void) +static inline int __init dccp_mib_init(void) { dccp_statistics = alloc_percpu(struct dccp_mib); if (!dccp_statistics) -- cgit v1.2.3 From e500f488c27659bb6f5d313b336621f3daa67701 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 1 Oct 2014 06:52:06 +0200 Subject: net/dccp/ccid.c: add __init to ccid_activate ccid_activate is only called by __init ccid_initialize_builtins in same module. Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/dccp/ccid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 597557254ddb..83498975165f 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -99,7 +99,7 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab) kmem_cache_destroy(slab); } -static int ccid_activate(struct ccid_operations *ccid_ops) +static int __init ccid_activate(struct ccid_operations *ccid_ops) { int err = -ENOBUFS; -- cgit v1.2.3 From ce1a4ea3f125863bfbcb1afb76590ee2b7b93fbf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Oct 2014 15:27:15 -0700 Subject: net: avoid one atomic operation in skb_clone() Fast clone cloning can actually avoid an atomic_inc(), if we guarantee prior clone_ref value is 1. This requires a change kfree_skbmem(), to perform the atomic_dec_and_test() on clone_ref before setting fclone to SKB_FCLONE_UNAVAILABLE. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a8cebb40699c..f77e64873caf 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -541,13 +541,20 @@ static void kfree_skbmem(struct sk_buff *skb) case SKB_FCLONE_CLONE: fclones = container_of(skb, struct sk_buff_fclones, skb2); - /* The clone portion is available for - * fast-cloning again. + /* Warning : We must perform the atomic_dec_and_test() before + * setting skb->fclone back to SKB_FCLONE_UNAVAILABLE, otherwise + * skb_clone() could set clone_ref to 2 before our decrement. + * Anyway, if we are going to free the structure, no need to + * rewrite skb->fclone. */ - skb->fclone = SKB_FCLONE_UNAVAILABLE; - - if (atomic_dec_and_test(&fclones->fclone_ref)) + if (atomic_dec_and_test(&fclones->fclone_ref)) { kmem_cache_free(skbuff_fclone_cache, fclones); + } else { + /* The clone portion is available for + * fast-cloning again. + */ + skb->fclone = SKB_FCLONE_UNAVAILABLE; + } break; } } @@ -869,7 +876,11 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (skb->fclone == SKB_FCLONE_ORIG && n->fclone == SKB_FCLONE_UNAVAILABLE) { n->fclone = SKB_FCLONE_CLONE; - atomic_inc(&fclones->fclone_ref); + /* As our fastclone was free, clone_ref must be 1 at this point. + * We could use atomic_inc() here, but it is faster + * to set the final value. + */ + atomic_set(&fclones->fclone_ref, 2); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; -- cgit v1.2.3 From 8bce6d7d0d1ede22af334ee241841e9278365278 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 29 Sep 2014 20:22:29 -0700 Subject: udp: Generalize skb_udp_segment skb_udp_segment is the function called from udp4_ufo_fragment to segment a UDP tunnel packet. This function currently assumes segmentation is transparent Ethernet bridging (i.e. VXLAN encapsulation). This patch generalizes the function to operate on either Ethertype or IP protocol. The inner_protocol field must be set to the protocol of the inner header. This can now be either an Ethertype or an IP protocol (in a union). A new flag in the skbuff indicates which type is effective. skb_set_inner_protocol and skb_set_inner_ipproto helper functions were added to set the inner_protocol. These functions are called from the point where the tunnel encapsulation is occuring. When skb_udp_tunnel_segment is called, the function to segment the inner packet is selected based on the inner IP or Ethertype. In the case of an IP protocol encapsulation, the function is derived from inet[6]_offloads. In the case of Ethertype, skb->protocol is set to the inner_protocol and skb_mac_gso_segment is called. (GRE currently does this, but it might be possible to lookup the protocol in offload_base and call the appropriate segmenation function directly). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 26 +++++++++++++++++++++++-- include/net/udp.h | 3 ++- net/ipv4/udp_offload.c | 51 +++++++++++++++++++++++++++++++++++++++++++++----- net/ipv6/udp_offload.c | 2 +- 4 files changed, 73 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d8f7d74d5a4d..7c5036d11feb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -596,7 +596,8 @@ struct sk_buff { __u8 ndisc_nodetype:2; #endif __u8 ipvs_property:1; - /* 5 or 7 bit hole */ + __u8 inner_protocol_type:1; + /* 4 or 6 bit hole */ #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -632,7 +633,11 @@ struct sk_buff { __u32 reserved_tailroom; }; - __be16 inner_protocol; + union { + __be16 inner_protocol; + __u8 inner_ipproto; + }; + __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; @@ -1762,6 +1767,23 @@ static inline void skb_reserve(struct sk_buff *skb, int len) skb->tail += len; } +#define ENCAP_TYPE_ETHER 0 +#define ENCAP_TYPE_IPPROTO 1 + +static inline void skb_set_inner_protocol(struct sk_buff *skb, + __be16 protocol) +{ + skb->inner_protocol = protocol; + skb->inner_protocol_type = ENCAP_TYPE_ETHER; +} + +static inline void skb_set_inner_ipproto(struct sk_buff *skb, + __u8 ipproto) +{ + skb->inner_ipproto = ipproto; + skb->inner_protocol_type = ENCAP_TYPE_IPPROTO; +} + static inline void skb_reset_inner_headers(struct sk_buff *skb) { skb->inner_mac_header = skb->mac_header; diff --git a/include/net/udp.h b/include/net/udp.h index 16f4e80f0519..07f9b70962f6 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -239,7 +239,8 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_disconnect(struct sock *sk, int flags); unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, - netdev_features_t features); + netdev_features_t features, + bool is_ipv6); int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int udp_lib_setsockopt(struct sock *sk, int level, int optname, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 19ebe6a39ddc..8c35f2c939ee 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -25,8 +25,11 @@ struct udp_offload_priv { struct udp_offload_priv __rcu *next; }; -struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features, + struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, + netdev_features_t features), + __be16 new_protocol) { struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; @@ -48,7 +51,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); - skb->protocol = htons(ETH_P_TEB); + skb->protocol = new_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); if (need_csum) @@ -56,7 +59,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); - segs = skb_mac_gso_segment(skb, enc_features); + segs = gso_inner_segment(skb, enc_features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); @@ -101,6 +104,44 @@ out: return segs; } +struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features, + bool is_ipv6) +{ + __be16 protocol = skb->protocol; + const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, + netdev_features_t features); + + rcu_read_lock(); + + switch (skb->inner_protocol_type) { + case ENCAP_TYPE_ETHER: + protocol = skb->inner_protocol; + gso_inner_segment = skb_mac_gso_segment; + break; + case ENCAP_TYPE_IPPROTO: + offloads = is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[skb->inner_ipproto]); + if (!ops || !ops->callbacks.gso_segment) + goto out_unlock; + gso_inner_segment = ops->callbacks.gso_segment; + break; + default: + goto out_unlock; + } + + segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment, + protocol); + +out_unlock: + rcu_read_unlock(); + + return segs; +} + static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { @@ -113,7 +154,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (skb->encapsulation && (skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { - segs = skb_udp_tunnel_segment(skb, features); + segs = skb_udp_tunnel_segment(skb, features, false); goto out; } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 212ebfc7973f..8f96988c1db2 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -58,7 +58,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) - segs = skb_udp_tunnel_segment(skb, features); + segs = skb_udp_tunnel_segment(skb, features, true); else { const struct ipv6hdr *ipv6h; struct udphdr *uh; -- cgit v1.2.3 From 469471cdfc1902861fedafe8c5c1c8dbf5ad6ba6 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 29 Sep 2014 20:22:30 -0700 Subject: sit: Set inner IP protocol in sit Call skb_set_inner_ipproto to set inner IP protocol to IPPROTO_IPV6 before tunnel_xmit. This is needed if UDP encapsulation (fou) is being done. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/sit.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index db75809ab843..0d4e27466f82 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -982,6 +982,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto tx_error; } + skb_set_inner_ipproto(skb, IPPROTO_IPV6); + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); @@ -1006,6 +1008,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (IS_ERR(skb)) goto out; + skb_set_inner_ipproto(skb, IPPROTO_IPIP); + ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP); return NETDEV_TX_OK; out: -- cgit v1.2.3 From 077c5a0948cc7b75032288bd37bd6641ef05da76 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 29 Sep 2014 20:22:31 -0700 Subject: ipip: Set inner IP protocol in ipip Call skb_set_inner_ipproto to set inner IP protocol to IPPROTO_IPV4 before tunnel_xmit. This is needed if UDP encapsulation (fou) is being done. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index bfec31df8b21..ea88ab3102a8 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -224,6 +224,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (IS_ERR(skb)) goto out; + skb_set_inner_ipproto(skb, IPPROTO_IPIP); + ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); return NETDEV_TX_OK; -- cgit v1.2.3 From 54bc9bac307861aea5abb747cb58bf0275f2175f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 29 Sep 2014 20:22:32 -0700 Subject: gre: Set inner protocol in v4 and v6 GRE transmit Call skb_set_inner_protocol to set inner Ethernet protocol to protocol being encapsulation by GRE before tunnel_xmit. This is needed for GSO if UDP encapsulation (fou) is being done. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 ++ net/ipv6/ip6_gre.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 829aff8bf723..0485ef18d254 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -241,6 +241,8 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, /* Push GRE header. */ gre_build_header(skb, &tpi, tunnel->tun_hlen); + skb_set_inner_protocol(skb, tpi.proto); + ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 5f19dfbc4c6a..9a0a1aafe727 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -616,6 +616,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, int err = -1; u8 proto; struct sk_buff *new_skb; + __be16 protocol; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; @@ -732,8 +733,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, ipv6h->daddr = fl6->daddr; ((__be16 *)(ipv6h + 1))[0] = tunnel->parms.o_flags; - ((__be16 *)(ipv6h + 1))[1] = (dev->type == ARPHRD_ETHER) ? - htons(ETH_P_TEB) : skb->protocol; + protocol = (dev->type == ARPHRD_ETHER) ? + htons(ETH_P_TEB) : skb->protocol; + ((__be16 *)(ipv6h + 1))[1] = protocol; if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { __be32 *ptr = (__be32 *)(((u8 *)ipv6h) + tunnel->hlen - 4); @@ -754,6 +756,8 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } } + skb_set_inner_protocol(skb, protocol); + ip6tunnel_xmit(skb, dev); if (ndst) ip6_tnl_dst_store(tunnel, ndst); -- cgit v1.2.3 From 6e0565697a106f2453b636da1ca481d9fe068bac Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 30 Sep 2014 16:07:23 -0700 Subject: net_sched: fix another crash in cls_tcindex This patch fixes the following crash: [ 166.670795] BUG: unable to handle kernel NULL pointer dereference at (null) [ 166.674230] IP: [] __list_del_entry+0x5c/0x98 [ 166.674230] PGD d0ea5067 PUD ce7fc067 PMD 0 [ 166.674230] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 166.674230] CPU: 1 PID: 775 Comm: tc Not tainted 3.17.0-rc6+ #642 [ 166.674230] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 166.674230] task: ffff8800d03c4d20 ti: ffff8800cae7c000 task.ti: ffff8800cae7c000 [ 166.674230] RIP: 0010:[] [] __list_del_entry+0x5c/0x98 [ 166.674230] RSP: 0018:ffff8800cae7f7d0 EFLAGS: 00010207 [ 166.674230] RAX: 0000000000000000 RBX: ffff8800cba8d700 RCX: ffff8800cba8d700 [ 166.674230] RDX: 0000000000000000 RSI: dead000000200200 RDI: ffff8800cba8d700 [ 166.674230] RBP: ffff8800cae7f7d0 R08: 0000000000000001 R09: 0000000000000001 [ 166.674230] R10: 0000000000000000 R11: 000000000000859a R12: ffffffffffffffe8 [ 166.674230] R13: ffff8800cba8c5b8 R14: 0000000000000001 R15: ffff8800cba8d700 [ 166.674230] FS: 00007fdb5f04a740(0000) GS:ffff88011a800000(0000) knlGS:0000000000000000 [ 166.674230] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 166.674230] CR2: 0000000000000000 CR3: 00000000cf929000 CR4: 00000000000006e0 [ 166.674230] Stack: [ 166.674230] ffff8800cae7f7e8 ffffffff814b73e8 ffff8800cba8d6e8 ffff8800cae7f828 [ 166.674230] ffffffff817caeec 0000000000000046 ffff8800cba8c5b0 ffff8800cba8c5b8 [ 166.674230] 0000000000000000 0000000000000001 ffff8800cf8e33e8 ffff8800cae7f848 [ 166.674230] Call Trace: [ 166.674230] [] list_del+0xd/0x2b [ 166.674230] [] tcf_action_destroy+0x4c/0x71 [ 166.674230] [] tcf_exts_destroy+0x20/0x2d [ 166.674230] [] tcindex_delete+0x196/0x1b7 struct list_head can not be simply copied and we should always init it. Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 8d0e83d6903e..30f10fb07f4a 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -254,10 +254,15 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, cp->tp = tp; if (p->perfect) { + int i; + cp->perfect = kmemdup(p->perfect, sizeof(*r) * cp->hash, GFP_KERNEL); if (!cp->perfect) goto errout; + for (i = 0; i < cp->hash; i++) + tcf_exts_init(&cp->perfect[i].exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); balloc = 1; } cp->h = p->h; @@ -353,6 +358,9 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, f = kzalloc(sizeof(*f), GFP_KERNEL); if (!f) goto errout_alloc; + f->key = handle; + tcindex_filter_result_init(&f->result); + f->next = NULL; } if (tb[TCA_TCINDEX_CLASSID]) { @@ -376,9 +384,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcindex_filter *nfp; struct tcindex_filter __rcu **fp; - f->key = handle; - f->result = new_filter_result; - f->next = NULL; + tcf_exts_change(tp, &f->result.exts, &r->exts); fp = cp->h + (handle % cp->hash); for (nfp = rtnl_dereference(*fp); -- cgit v1.2.3 From a0efb80ce3abacfd22a4284c3730924fc2f1f077 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 30 Sep 2014 16:07:24 -0700 Subject: net_sched: avoid calling tcf_unbind_filter() in call_rcu callback This fixes the following crash: [ 63.976822] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 63.980094] CPU: 1 PID: 15 Comm: ksoftirqd/1 Not tainted 3.17.0-rc6+ #648 [ 63.980094] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 63.980094] task: ffff880117dea690 ti: ffff880117dfc000 task.ti: ffff880117dfc000 [ 63.980094] RIP: 0010:[] [] u32_destroy_key+0x27/0x6d [ 63.980094] RSP: 0018:ffff880117dffcc0 EFLAGS: 00010202 [ 63.980094] RAX: ffff880117dea690 RBX: ffff8800d02e0820 RCX: 0000000000000000 [ 63.980094] RDX: 0000000000000001 RSI: 0000000000000002 RDI: 6b6b6b6b6b6b6b6b [ 63.980094] RBP: ffff880117dffcd0 R08: 0000000000000000 R09: 0000000000000000 [ 63.980094] R10: 00006c0900006ba8 R11: 00006ba100006b9d R12: 0000000000000001 [ 63.980094] R13: ffff8800d02e0898 R14: ffffffff817e6d4d R15: ffff880117387a30 [ 63.980094] FS: 0000000000000000(0000) GS:ffff88011a800000(0000) knlGS:0000000000000000 [ 63.980094] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 63.980094] CR2: 00007f07e6732fed CR3: 000000011665b000 CR4: 00000000000006e0 [ 63.980094] Stack: [ 63.980094] ffff88011a9cd300 ffffffff82051ac0 ffff880117dffce0 ffffffff817e6d68 [ 63.980094] ffff880117dffd70 ffffffff810cb4c7 ffffffff810cb3cd ffff880117dfffd8 [ 63.980094] ffff880117dea690 ffff880117dea690 ffff880117dfffd8 000000000000000a [ 63.980094] Call Trace: [ 63.980094] [] u32_delete_key_freepf_rcu+0x1b/0x1d [ 63.980094] [] rcu_process_callbacks+0x3bb/0x691 [ 63.980094] [] ? rcu_process_callbacks+0x2c1/0x691 [ 63.980094] [] ? u32_destroy_key+0x6d/0x6d [ 63.980094] [] __do_softirq+0x142/0x323 [ 63.980094] [] run_ksoftirqd+0x23/0x53 [ 63.980094] [] smpboot_thread_fn+0x203/0x221 [ 63.980094] [] ? smpboot_unpark_thread+0x33/0x33 [ 63.980094] [] kthread+0xc9/0xd1 [ 63.980094] [] ? do_wait_for_common+0xf8/0x125 [ 63.980094] [] ? __kthread_parkme+0x61/0x61 [ 63.980094] [] ret_from_fork+0x7c/0xb0 [ 63.980094] [] ? __kthread_parkme+0x61/0x61 tp could be freed in call_rcu callback too, the order is not guaranteed. John Fastabend says: ==================== Its worth noting why this is safe. Any running schedulers will either read the valid class field or it will be zeroed. All schedulers today when the class is 0 do a lookup using the same call used by the tcf_exts_bind(). So even if we have a running classifier hit the null class pointer it will do a lookup and get to the same result. This is particularly fragile at the moment because the only way to verify this is to audit the schedulers call sites. ==================== Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 6 +----- net/sched/cls_u32.c | 10 ++++++---- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 73f9532ee581..ef44ad9a6426 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -20,11 +20,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); static inline unsigned long __cls_set_class(unsigned long *clp, unsigned long cl) { - unsigned long old_cl; - - old_cl = *clp; - *clp = cl; - return old_cl; + return xchg(clp, cl); } static inline unsigned long diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4be3ebf46d67..0472909bb014 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -358,7 +358,6 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, bool free_pf) { - tcf_unbind_filter(tp, &n->res); tcf_exts_destroy(&n->exts); if (n->ht_down) n->ht_down->refcnt--; @@ -416,6 +415,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) if (pkp == key) { RCU_INIT_POINTER(*kp, key->next); + tcf_unbind_filter(tp, &key->res); call_rcu(&key->rcu, u32_delete_key_freepf_rcu); return 0; } @@ -425,7 +425,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) return 0; } -static void u32_clear_hnode(struct tc_u_hnode *ht) +static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) { struct tc_u_knode *n; unsigned int h; @@ -434,6 +434,7 @@ static void u32_clear_hnode(struct tc_u_hnode *ht) while ((n = rtnl_dereference(ht->ht[h])) != NULL) { RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); + tcf_unbind_filter(tp, &n->res); call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } @@ -447,7 +448,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) WARN_ON(ht->refcnt); - u32_clear_hnode(ht); + u32_clear_hnode(tp, ht); hn = &tp_c->hlist; for (phn = rtnl_dereference(*hn); @@ -482,7 +483,7 @@ static void u32_destroy(struct tcf_proto *tp) ht; ht = rtnl_dereference(ht->next)) { ht->refcnt--; - u32_clear_hnode(ht); + u32_clear_hnode(tp, ht); } while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) { @@ -731,6 +732,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } u32_replace_knode(tp, tp_c, new); + tcf_unbind_filter(tp, &n->res); call_rcu(&n->rcu, u32_delete_key_rcu); return 0; } -- cgit v1.2.3 From 775dd692bd34f9201ed2aa775a0edcba4f973f3e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 30 Sep 2014 16:13:19 -0700 Subject: net: bridge: add a br_set_state helper function In preparation for being able to propagate port states to e.g: notifiers or other kernel parts, do not manipulate the port state directly, but instead use a helper function which will allow us to do a bit more than just setting the state. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/bridge/br_if.c | 2 +- net/bridge/br_netlink.c | 2 +- net/bridge/br_private.h | 1 + net/bridge/br_stp.c | 15 ++++++++++----- net/bridge/br_stp_if.c | 4 ++-- net/bridge/br_stp_timer.c | 4 ++-- 6 files changed, 17 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index a9f54a9b6690..7b7289ca2992 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -332,7 +332,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->port_no = index; p->flags = BR_LEARNING | BR_FLOOD; br_init_port(p); - p->state = BR_STATE_DISABLED; + br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); br_multicast_add_port(p); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 0fa66b83685f..2ff9706647f2 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -301,7 +301,7 @@ static int br_set_port_state(struct net_bridge_port *p, u8 state) (!netif_oper_up(p->dev) && state != BR_STATE_DISABLED)) return -ENETDOWN; - p->state = state; + br_set_state(p, state); br_log_state(p); br_port_state_selection(p->br); return 0; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index f53592fc3ef9..fe7463c2af9a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -766,6 +766,7 @@ static inline void br_nf_core_fini(void) {} /* br_stp.c */ void br_log_state(const struct net_bridge_port *p); +void br_set_state(struct net_bridge_port *p, unsigned int state); struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no); void br_init_port(struct net_bridge_port *p); void br_become_designated_port(struct net_bridge_port *p); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 3c86f0538cbb..2b047bcf42a4 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -36,6 +36,11 @@ void br_log_state(const struct net_bridge_port *p) br_port_state_names[p->state]); } +void br_set_state(struct net_bridge_port *p, unsigned int state) +{ + p->state = state; +} + /* called under bridge lock */ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no) { @@ -107,7 +112,7 @@ static void br_root_port_block(const struct net_bridge *br, br_notice(br, "port %u(%s) tried to become root port (blocked)", (unsigned int) p->port_no, p->dev->name); - p->state = BR_STATE_LISTENING; + br_set_state(p, BR_STATE_LISTENING); br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); @@ -387,7 +392,7 @@ static void br_make_blocking(struct net_bridge_port *p) p->state == BR_STATE_LEARNING) br_topology_change_detection(p->br); - p->state = BR_STATE_BLOCKING; + br_set_state(p, BR_STATE_BLOCKING); br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); @@ -404,13 +409,13 @@ static void br_make_forwarding(struct net_bridge_port *p) return; if (br->stp_enabled == BR_NO_STP || br->forward_delay == 0) { - p->state = BR_STATE_FORWARDING; + br_set_state(p, BR_STATE_FORWARDING); br_topology_change_detection(br); del_timer(&p->forward_delay_timer); } else if (br->stp_enabled == BR_KERNEL_STP) - p->state = BR_STATE_LISTENING; + br_set_state(p, BR_STATE_LISTENING); else - p->state = BR_STATE_LEARNING; + br_set_state(p, BR_STATE_LEARNING); br_multicast_enable_port(p); br_log_state(p); diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 189ba1e7d851..41146872c1b4 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -37,7 +37,7 @@ void br_init_port(struct net_bridge_port *p) { p->port_id = br_make_port_id(p->priority, p->port_no); br_become_designated_port(p); - p->state = BR_STATE_BLOCKING; + br_set_state(p, BR_STATE_BLOCKING); p->topology_change_ack = 0; p->config_pending = 0; } @@ -100,7 +100,7 @@ void br_stp_disable_port(struct net_bridge_port *p) wasroot = br_is_root_bridge(br); br_become_designated_port(p); - p->state = BR_STATE_DISABLED; + br_set_state(p, BR_STATE_DISABLED); p->topology_change_ack = 0; p->config_pending = 0; diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 558c46d19e05..4fcaa67750fd 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -87,11 +87,11 @@ static void br_forward_delay_timer_expired(unsigned long arg) (unsigned int) p->port_no, p->dev->name); spin_lock(&br->lock); if (p->state == BR_STATE_LISTENING) { - p->state = BR_STATE_LEARNING; + br_set_state(p, BR_STATE_LEARNING); mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); } else if (p->state == BR_STATE_LEARNING) { - p->state = BR_STATE_FORWARDING; + br_set_state(p, BR_STATE_FORWARDING); if (br_is_designated_for_some_port(br)) br_topology_change_detection(br); netif_carrier_on(br->dev); -- cgit v1.2.3 From 38b2cf2982dc73d3f07fe84fec8cc4ed9f64c1c5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 Sep 2014 17:53:21 -0700 Subject: net: pktgen: packet bursting via skb->xmit_more This patch demonstrates the effect of delaying update of HW tailptr. (based on earlier patch by Jesper) burst=1 is the default. It sends one packet with xmit_more=false burst=2 sends one packet with xmit_more=true and 2nd copy of the same packet with xmit_more=false burst=3 sends two copies of the same packet with xmit_more=true and 3rd copy with xmit_more=false Performance with ixgbe (usec 30): burst=1 tx:9.2 Mpps burst=2 tx:13.5 Mpps burst=3 tx:14.5 Mpps full 10G line rate Signed-off-by: Alexei Starovoitov Signed-off-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- Documentation/networking/pktgen.txt | 3 +++ net/core/pktgen.c | 26 ++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/networking/pktgen.txt b/Documentation/networking/pktgen.txt index 0dffc6e37902..6915c6b27869 100644 --- a/Documentation/networking/pktgen.txt +++ b/Documentation/networking/pktgen.txt @@ -99,6 +99,9 @@ Examples: pgset "clone_skb 1" sets the number of copies of the same packet pgset "clone_skb 0" use single SKB for all transmits + pgset "burst 8" uses xmit_more API to queue 8 copies of the same + packet and update HW tx queue tail pointer once. + "burst 1" is the default pgset "pkt_size 9014" sets packet size to 9014 pgset "frags 5" packet will consist of 5 fragments pgset "count 200000" sets number of packets to send, set to zero diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 5c728aaf8d6c..443256bdcddc 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -387,6 +387,7 @@ struct pktgen_dev { u16 queue_map_min; u16 queue_map_max; __u32 skb_priority; /* skb priority field */ + unsigned int burst; /* number of duplicated packets to burst */ int node; /* Memory node */ #ifdef CONFIG_XFRM @@ -613,6 +614,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->traffic_class) seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class); + if (pkt_dev->burst > 1) + seq_printf(seq, " burst: %d\n", pkt_dev->burst); + if (pkt_dev->node >= 0) seq_printf(seq, " node: %d\n", pkt_dev->node); @@ -1124,6 +1128,16 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->dst_mac_count); return count; } + if (!strcmp(name, "burst")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->burst = value < 1 ? 1 : value; + sprintf(pg_result, "OK: burst=%d", pkt_dev->burst); + return count; + } if (!strcmp(name, "node")) { len = num_arg(&user_buffer[i], 10, &value); if (len < 0) @@ -3297,6 +3311,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev *pkt_dev) { + unsigned int burst = ACCESS_ONCE(pkt_dev->burst); struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; int ret; @@ -3347,8 +3362,10 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->last_ok = 0; goto unlock; } - atomic_inc(&(pkt_dev->skb->users)); - ret = netdev_start_xmit(pkt_dev->skb, odev, txq, false); + atomic_add(burst, &pkt_dev->skb->users); + +xmit_more: + ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0); switch (ret) { case NETDEV_TX_OK: @@ -3356,6 +3373,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->sofar++; pkt_dev->seq_num++; pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + if (burst > 0 && !netif_xmit_frozen_or_drv_stopped(txq)) + goto xmit_more; break; case NET_XMIT_DROP: case NET_XMIT_CN: @@ -3374,6 +3393,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) atomic_dec(&(pkt_dev->skb->users)); pkt_dev->last_ok = 0; } + if (unlikely(burst)) + atomic_sub(burst, &pkt_dev->skb->users); unlock: HARD_TX_UNLOCK(odev, txq); @@ -3572,6 +3593,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_p = 0; pkt_dev->svlan_cfi = 0; pkt_dev->svlan_id = 0xffff; + pkt_dev->burst = 1; pkt_dev->node = -1; err = pktgen_setup_dev(t->net, pkt_dev, ifname); -- cgit v1.2.3 From 02e246aee868e982eecc25ee97d02acf2c2b8884 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 2 Oct 2014 10:16:22 +0300 Subject: Bluetooth: Fix lockdep warning with l2cap_chan_connect The L2CAP connection's channel list lock (conn->chan_lock) must never be taken while already holding a channel lock (chan->lock) in order to avoid lock-inversion and lockdep warnings. So far the l2cap_chan_connect function has acquired the chan->lock early in the function and then later called l2cap_chan_add(conn, chan) which will try to take the conn->chan_lock. This violates the correct order of taking the locks and may lead to the following type of lockdep warnings: -> #1 (&conn->chan_lock){+.+...}: [] lock_acquire+0x9d/0x140 [] mutex_lock_nested+0x6c/0x420 [] l2cap_chan_add+0x1e/0x40 [bluetooth] [] l2cap_chan_connect+0x348/0x8f0 [bluetooth] [] lowpan_control_write+0x221/0x2d0 [bluetooth_6lowpan] -> #0 (&chan->lock){+.+.+.}: [] __lock_acquire+0x1a18/0x1d20 [] lock_acquire+0x9d/0x140 [] mutex_lock_nested+0x6c/0x420 [] l2cap_connect_cfm+0x1dd/0x3f0 [bluetooth] [] hci_le_meta_evt+0x11a4/0x1260 [bluetooth] [] hci_event_packet+0x3ab/0x3120 [bluetooth] [] hci_rx_work+0x208/0x4a0 [bluetooth] CPU0 CPU1 ---- ---- lock(&conn->chan_lock); lock(&chan->lock); lock(&conn->chan_lock); lock(&chan->lock); Before calling l2cap_chan_add() the channel is not part of the conn->chan_l list, and can therefore only be accessed by the L2CAP user (such as l2cap_sock.c). We can therefore assume that it is the responsibility of the user to handle mutual exclusion until this point (which we can see is already true in l2cap_sock.c by it in many places touching chan members without holding chan->lock). Since the hci_conn and by exctension l2cap_conn creation in the l2cap_chan_connect() function depend on chan details we cannot simply add a mutex_lock(&conn->chan_lock) in the beginning of the function (since the conn object doesn't yet exist there). What we can do however is move the chan->lock taking later into the function where we already have the conn object and can that way take conn->chan_lock first. This patch implements the above strategy and does some other necessary changes such as using __l2cap_chan_add() which assumes conn->chan_lock is held, as well as adding a second needed label so the unlocking happens as it should. Reported-by: Jukka Rissanen Signed-off-by: Johan Hedberg Tested-by: Jukka Rissanen Acked-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 8d53fc57faba..b6f9777e057d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6980,8 +6980,6 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, hci_dev_lock(hdev); - l2cap_chan_lock(chan); - if (!is_valid_psm(__le16_to_cpu(psm), dst_type) && !cid && chan->chan_type != L2CAP_CHAN_RAW) { err = -EINVAL; @@ -7078,17 +7076,20 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, goto done; } + mutex_lock(&conn->chan_lock); + l2cap_chan_lock(chan); + if (cid && __l2cap_get_chan_by_dcid(conn, cid)) { hci_conn_drop(hcon); err = -EBUSY; - goto done; + goto chan_unlock; } /* Update source addr of the socket */ bacpy(&chan->src, &hcon->src); chan->src_type = bdaddr_type(hcon, hcon->src_type); - l2cap_chan_add(conn, chan); + __l2cap_chan_add(conn, chan); /* l2cap_chan_add takes its own ref so we can drop this one */ hci_conn_drop(hcon); @@ -7114,8 +7115,10 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, err = 0; -done: +chan_unlock: l2cap_chan_unlock(chan); + mutex_unlock(&conn->chan_lock); +done: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; -- cgit v1.2.3 From fc12518a4bcbd4214652291df76f692343bca3d5 Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Wed, 1 Oct 2014 11:30:26 +0300 Subject: Bluetooth: 6lowpan: Memory leak as the skb is not freed The earlier multicast commit 36b3dd250dde ("Bluetooth: 6lowpan: Ensure header compression does not corrupt IPv6 header") lost one skb free which then caused memory leak. Signed-off-by: Jukka Rissanen Signed-off-by: Johan Hedberg --- net/bluetooth/6lowpan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index f0432aea8dad..add2b58312d7 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -625,6 +625,8 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) send_mcast_pkt(skb, netdev); } + dev_kfree_skb(skb); + if (err) BT_DBG("ERROR: xmit failed (%d)", err); -- cgit v1.2.3 From a7807d73a0fa9b33dbdfd5f1cb97970ccc91d77e Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Wed, 1 Oct 2014 11:30:57 +0300 Subject: Bluetooth: 6lowpan: Avoid memory leak if memory allocation fails If skb_unshare() returns NULL, then we leak the original skb. Solution is to use temp variable to hold the new skb. Signed-off-by: Jukka Rissanen Signed-off-by: Johan Hedberg --- net/bluetooth/6lowpan.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index add2b58312d7..44eaad77e70c 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -589,13 +589,17 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) int err = 0; bdaddr_t addr; u8 addr_type; + struct sk_buff *tmpskb; /* We must take a copy of the skb before we modify/replace the ipv6 * header as the header could be used elsewhere */ - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) + tmpskb = skb_unshare(skb, GFP_ATOMIC); + if (!tmpskb) { + kfree_skb(skb); return NET_XMIT_DROP; + } + skb = tmpskb; /* Return values from setup_header() * <0 - error, packet is dropped -- cgit v1.2.3 From d7b6b0a532da7de25e16deed610658cfa1969fe9 Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Wed, 1 Oct 2014 15:59:14 +0300 Subject: Bluetooth: 6lowpan: Return EAGAIN error also for multicast packets Make sure that we are able to return EAGAIN from l2cap_chan_send() even for multicast packets. The error code was ignored unncessarily. Signed-off-by: Jukka Rissanen Signed-off-by: Johan Hedberg --- net/bluetooth/6lowpan.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 44eaad77e70c..fcfaa7006b28 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -509,21 +509,16 @@ static int header_create(struct sk_buff *skb, struct net_device *netdev, /* Packet to BT LE device */ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, - struct net_device *netdev, bool is_mcast) + struct net_device *netdev) { struct msghdr msg; struct kvec iv; int err; /* Remember the skb so that we can send EAGAIN to the caller if - * we run out of credits. This is not done for multicast packets - * because we generate mcast packet in this module and are not - * really able to remember the skb after this packet is sent. + * we run out of credits. */ - if (is_mcast) - chan->data = NULL; - else - chan->data = skb; + chan->data = skb; memset(&msg, 0, sizeof(msg)); msg.msg_iov = (struct iovec *) &iv; @@ -575,7 +570,7 @@ static void send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) netdev->name, &pentry->chan->dst, pentry->chan->dst_type, &pentry->peer_addr, pentry->chan); - send_pkt(pentry->chan, local_skb, netdev, true); + send_pkt(pentry->chan, local_skb, netdev); kfree_skb(local_skb); } @@ -617,8 +612,7 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p", netdev->name, &addr, addr_type, &lowpan_cb(skb)->addr, lowpan_cb(skb)->chan); - err = send_pkt(lowpan_cb(skb)->chan, skb, netdev, - false); + err = send_pkt(lowpan_cb(skb)->chan, skb, netdev); } else { err = -ENOENT; } -- cgit v1.2.3 From 9c238ca8ec79c38ab22762b44aeaf7a42fc97b18 Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Wed, 1 Oct 2014 15:59:15 +0300 Subject: Bluetooth: 6lowpan: Check transmit errors for multicast packets We did not return error if multicast packet transmit failed. This might not be desired so return error also in this case. If there are multiple 6lowpan devices where the multicast packet is sent, then return error even if sending to only one of them fails. Signed-off-by: Jukka Rissanen Signed-off-by: Johan Hedberg --- net/bluetooth/6lowpan.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index fcfaa7006b28..c2e0d14433df 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -546,11 +546,12 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, return err; } -static void send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) +static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) { struct sk_buff *local_skb; struct lowpan_dev *entry, *tmp; unsigned long flags; + int err = 0; read_lock_irqsave(&devices_lock, flags); @@ -564,19 +565,25 @@ static void send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) dev = lowpan_dev(entry->netdev); list_for_each_entry_safe(pentry, ptmp, &dev->peers, list) { + int ret; + local_skb = skb_clone(skb, GFP_ATOMIC); BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p", netdev->name, &pentry->chan->dst, pentry->chan->dst_type, &pentry->peer_addr, pentry->chan); - send_pkt(pentry->chan, local_skb, netdev); + ret = send_pkt(pentry->chan, local_skb, netdev); + if (ret < 0) + err = ret; kfree_skb(local_skb); } } read_unlock_irqrestore(&devices_lock, flags); + + return err; } static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) @@ -620,7 +627,7 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) /* We need to send the packet to every device behind this * interface. */ - send_mcast_pkt(skb, netdev); + err = send_mcast_pkt(skb, netdev); } dev_kfree_skb(skb); -- cgit v1.2.3 From 51b0a5d8c21a91801bbef9bcc8639dc0b206c6cd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 26 Sep 2014 14:35:14 +0200 Subject: netfilter: nft_reject: introduce icmp code abstraction for inet and bridge This patch introduces the NFT_REJECT_ICMPX_UNREACH type which provides an abstraction to the ICMP and ICMPv6 codes that you can use from the inet and bridge tables, they are: * NFT_REJECT_ICMPX_NO_ROUTE: no route to host - network unreachable * NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable * NFT_REJECT_ICMPX_HOST_UNREACH: host unreachable * NFT_REJECT_ICMPX_ADMIN_PROHIBITED: administratevely prohibited You can still use the specific codes when restricting the rule to match the corresponding layer 3 protocol. I decided to not overload the existing NFT_REJECT_ICMP_UNREACH to have different semantics depending on the table family and to allow the user to specify ICMP family specific codes if they restrict it to the corresponding family. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_reject.h | 1 + include/net/netfilter/nft_reject.h | 9 +-- include/uapi/linux/netfilter/nf_tables.h | 21 +++++++ net/bridge/netfilter/nft_reject_bridge.c | 95 ++++++++++++++++++++++++++++++-- net/ipv4/netfilter/nft_reject_ipv4.c | 1 - net/netfilter/nft_reject.c | 37 +++++++++++++ net/netfilter/nft_reject_inet.c | 94 +++++++++++++++++++++++++++++-- 7 files changed, 241 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index f713b5a31d62..8ce06385a552 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -5,6 +5,7 @@ #include #include #include +#include static inline void nf_send_unreach(struct sk_buff *skb_in, int code) { diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 36b0da2d55bb..60fa1530006b 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -14,12 +14,7 @@ int nft_reject_init(const struct nft_ctx *ctx, int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); -void nft_reject_ipv4_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt); - -void nft_reject_ipv6_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt); +int nft_reject_icmp_code(u8 code); +int nft_reject_icmpv6_code(u8 code); #endif diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b72ccfeaf865..c26df6787fb0 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -749,12 +749,33 @@ enum nft_queue_attributes { * * @NFT_REJECT_ICMP_UNREACH: reject using ICMP unreachable * @NFT_REJECT_TCP_RST: reject using TCP RST + * @NFT_REJECT_ICMPX_UNREACH: abstracted ICMP unreachable for bridge and inet */ enum nft_reject_types { NFT_REJECT_ICMP_UNREACH, NFT_REJECT_TCP_RST, + NFT_REJECT_ICMPX_UNREACH, }; +/** + * enum nft_reject_code - Generic reject codes for IPv4/IPv6 + * + * @NFT_REJECT_ICMPX_NO_ROUTE: no route to host / network unreachable + * @NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable + * @NFT_REJECT_ICMPX_HOST_UNREACH: host unreachable + * @NFT_REJECT_ICMPX_ADMIN_PROHIBITED: administratively prohibited + * + * These codes are mapped to real ICMP and ICMPv6 codes. + */ +enum nft_reject_inet_code { + NFT_REJECT_ICMPX_NO_ROUTE = 0, + NFT_REJECT_ICMPX_PORT_UNREACH, + NFT_REJECT_ICMPX_HOST_UNREACH, + NFT_REJECT_ICMPX_ADMIN_PROHIBITED, + __NFT_REJECT_ICMPX_MAX +}; +#define NFT_REJECT_ICMPX_MAX (__NFT_REJECT_ICMPX_MAX + 1) + /** * enum nft_reject_attributes - nf_tables reject expression netlink attributes * diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index ee3ffe93e14e..a76479535df2 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -14,21 +14,106 @@ #include #include #include +#include +#include static void nft_reject_bridge_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1], const struct nft_pktinfo *pkt) { + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + switch (eth_hdr(pkt->skb)->h_proto) { case htons(ETH_P_IP): - return nft_reject_ipv4_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach(pkt->skb, + nft_reject_icmp_code(priv->icmp_code)); + break; + } + break; case htons(ETH_P_IPV6): - return nft_reject_ipv6_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach6(net, pkt->skb, + nft_reject_icmpv6_code(priv->icmp_code), + pkt->ops->hooknum); + break; + } + break; default: /* No explicit way to reject this protocol, drop it. */ - data[NFT_REG_VERDICT].verdict = NF_DROP; break; } + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static int nft_reject_bridge_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + int icmp_code; + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + + icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + if (priv->type == NFT_REJECT_ICMPX_UNREACH && + icmp_code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + priv->icmp_code = icmp_code; + break; + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; + } + return 0; +} + +static int nft_reject_bridge_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; } static struct nft_expr_type nft_reject_bridge_type; @@ -36,8 +121,8 @@ static const struct nft_expr_ops nft_reject_bridge_ops = { .type = &nft_reject_bridge_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_bridge_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, + .init = nft_reject_bridge_init, + .dump = nft_reject_bridge_dump, }; static struct nft_expr_type nft_reject_bridge_type __read_mostly = { diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index e79718a382f2..ed33299c56d1 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index f3448c296446..ec8a456092a7 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, @@ -70,5 +72,40 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_reject_dump); +static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX] = { + [NFT_REJECT_ICMPX_NO_ROUTE] = ICMP_NET_UNREACH, + [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMP_PORT_UNREACH, + [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMP_HOST_UNREACH, + [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMP_PKT_FILTERED, +}; + +int nft_reject_icmp_code(u8 code) +{ + if (code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + return icmp_code_v4[code]; +} + +EXPORT_SYMBOL_GPL(nft_reject_icmp_code); + + +static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX] = { + [NFT_REJECT_ICMPX_NO_ROUTE] = ICMPV6_NOROUTE, + [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMPV6_PORT_UNREACH, + [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMPV6_ADDR_UNREACH, + [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMPV6_ADM_PROHIBITED, +}; + +int nft_reject_icmpv6_code(u8 code) +{ + if (code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + return icmp_code_v6[code]; +} + +EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index b718a52a4654..7b5f9d58680a 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -14,17 +14,103 @@ #include #include #include +#include +#include static void nft_reject_inet_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1], const struct nft_pktinfo *pkt) { + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + switch (pkt->ops->pf) { case NFPROTO_IPV4: - return nft_reject_ipv4_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach(pkt->skb, + nft_reject_icmp_code(priv->icmp_code)); + break; + } + break; case NFPROTO_IPV6: - return nft_reject_ipv6_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach6(net, pkt->skb, + nft_reject_icmpv6_code(priv->icmp_code), + pkt->ops->hooknum); + break; + } + break; + } + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static int nft_reject_inet_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + int icmp_code; + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + + icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + if (priv->type == NFT_REJECT_ICMPX_UNREACH && + icmp_code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + priv->icmp_code = icmp_code; + break; + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; } + return 0; +} + +static int nft_reject_inet_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; } static struct nft_expr_type nft_reject_inet_type; @@ -32,8 +118,8 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .type = &nft_reject_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_inet_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, + .init = nft_reject_inet_init, + .dump = nft_reject_inet_dump, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { -- cgit v1.2.3 From c8d7b98bec43faaa6583c3135030be5eb4693acb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 26 Sep 2014 14:35:15 +0200 Subject: netfilter: move nf_send_resetX() code to nf_reject_ipvX modules Move nf_send_reset() and nf_send_reset6() to nf_reject_ipv4 and nf_reject_ipv6 respectively. This code is shared by x_tables and nf_tables. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_reject.h | 118 +----------------------- net/ipv4/netfilter/Kconfig | 6 ++ net/ipv4/netfilter/Makefile | 3 + net/ipv4/netfilter/nf_reject_ipv4.c | 127 +++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 6 ++ net/ipv6/netfilter/Makefile | 3 + net/ipv6/netfilter/nf_reject_ipv6.c | 163 +++++++++++++++++++++++++++++++++ 7 files changed, 309 insertions(+), 117 deletions(-) create mode 100644 net/ipv4/netfilter/nf_reject_ipv4.c create mode 100644 net/ipv6/netfilter/nf_reject_ipv6.c (limited to 'net') diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index 8ce06385a552..e8427193c777 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -1,10 +1,6 @@ #ifndef _IPV4_NF_REJECT_H #define _IPV4_NF_REJECT_H -#include -#include -#include -#include #include static inline void nf_send_unreach(struct sk_buff *skb_in, int code) @@ -12,118 +8,6 @@ static inline void nf_send_unreach(struct sk_buff *skb_in, int code) icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); } -/* Send RST reply */ -static void nf_send_reset(struct sk_buff *oldskb, int hook) -{ - struct sk_buff *nskb; - const struct iphdr *oiph; - struct iphdr *niph; - const struct tcphdr *oth; - struct tcphdr _otcph, *tcph; - - /* IP header checks: fragment. */ - if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) - return; - - oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), - sizeof(_otcph), &_otcph); - if (oth == NULL) - return; - - /* No RST for RST. */ - if (oth->rst) - return; - - if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return; - - /* Check checksum */ - if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) - return; - oiph = ip_hdr(oldskb); - - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); - if (!nskb) - return; - - skb_reserve(nskb, LL_MAX_HEADER); - - skb_reset_network_header(nskb); - niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); - niph->version = 4; - niph->ihl = sizeof(struct iphdr) / 4; - niph->tos = 0; - niph->id = 0; - niph->frag_off = htons(IP_DF); - niph->protocol = IPPROTO_TCP; - niph->check = 0; - niph->saddr = oiph->daddr; - niph->daddr = oiph->saddr; - - skb_reset_transport_header(nskb); - tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); - memset(tcph, 0, sizeof(*tcph)); - tcph->source = oth->dest; - tcph->dest = oth->source; - tcph->doff = sizeof(struct tcphdr) / 4; - - if (oth->ack) - tcph->seq = oth->ack_seq; - else { - tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + - oldskb->len - ip_hdrlen(oldskb) - - (oth->doff << 2)); - tcph->ack = 1; - } - - tcph->rst = 1; - tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, - niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)tcph - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - /* ip_route_me_harder expects skb->dst to be set */ - skb_dst_set_noref(nskb, skb_dst(oldskb)); - - nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) - goto free_nskb; - - niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); - - /* "Never happens" */ - if (nskb->len > dst_mtu(skb_dst(nskb))) - goto free_nskb; - - nf_ct_attach(nskb, oldskb); - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - /* If we use ip_local_out for bridged traffic, the MAC source on - * the RST will be ours, instead of the destination's. This confuses - * some routers/firewalls, and they drop the packet. So we need to - * build the eth header using the original destination's MAC as the - * source, and send the RST packet directly. - */ - if (oldskb->nf_bridge) { - struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = oldskb->nf_bridge->physindev; - niph->tot_len = htons(nskb->len); - ip_send_check(niph); - if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), - oeth->h_source, oeth->h_dest, nskb->len) < 0) - goto free_nskb; - dev_queue_xmit(nskb); - } else -#endif - ip_local_out(nskb); - - return; - - free_nskb: - kfree_skb(nskb); -} - +void nf_send_reset(struct sk_buff *oldskb, int hook); #endif /* _IPV4_NF_REJECT_H */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 345242a79db6..4c019d5c3f57 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -61,8 +61,13 @@ config NFT_CHAIN_ROUTE_IPV4 fields such as the source, destination, type of service and the packet mark. +config NF_REJECT_IPV4 + tristate "IPv4 packet rejection" + default m if NETFILTER_ADVANCED=n + config NFT_REJECT_IPV4 depends on NF_TABLES_IPV4 + select NF_REJECT_IPV4 default NFT_REJECT tristate @@ -208,6 +213,7 @@ config IP_NF_FILTER config IP_NF_TARGET_REJECT tristate "REJECT target support" depends on IP_NF_FILTER + select NF_REJECT_IPV4 default m if NETFILTER_ADVANCED=n help The REJECT target allows a filtering rule to specify that an ICMP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 14488cc5fd2c..f4cef5af0969 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -23,6 +23,9 @@ obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o +# reject +obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o + # NAT helpers (nf_conntrack) obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c new file mode 100644 index 000000000000..b023b4eb1a96 --- /dev/null +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -0,0 +1,127 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +/* Send RST reply */ +void nf_send_reset(struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + const struct iphdr *oiph; + struct iphdr *niph; + const struct tcphdr *oth; + struct tcphdr _otcph, *tcph; + + /* IP header checks: fragment. */ + if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) + return; + + oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), + sizeof(_otcph), &_otcph); + if (oth == NULL) + return; + + /* No RST for RST. */ + if (oth->rst) + return; + + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + return; + + /* Check checksum */ + if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) + return; + oiph = ip_hdr(oldskb); + + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + return; + + skb_reserve(nskb, LL_MAX_HEADER); + + skb_reset_network_header(nskb); + niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); + niph->version = 4; + niph->ihl = sizeof(struct iphdr) / 4; + niph->tos = 0; + niph->id = 0; + niph->frag_off = htons(IP_DF); + niph->protocol = IPPROTO_TCP; + niph->check = 0; + niph->saddr = oiph->daddr; + niph->daddr = oiph->saddr; + + skb_reset_transport_header(nskb); + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + memset(tcph, 0, sizeof(*tcph)); + tcph->source = oth->dest; + tcph->dest = oth->source; + tcph->doff = sizeof(struct tcphdr) / 4; + + if (oth->ack) + tcph->seq = oth->ack_seq; + else { + tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + + oldskb->len - ip_hdrlen(oldskb) - + (oth->doff << 2)); + tcph->ack = 1; + } + + tcph->rst = 1; + tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, + niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)tcph - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + /* ip_route_me_harder expects skb->dst to be set */ + skb_dst_set_noref(nskb, skb_dst(oldskb)); + + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(nskb, RTN_UNSPEC)) + goto free_nskb; + + niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); + + /* "Never happens" */ + if (nskb->len > dst_mtu(skb_dst(nskb))) + goto free_nskb; + + nf_ct_attach(nskb, oldskb); + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + /* If we use ip_local_out for bridged traffic, the MAC source on + * the RST will be ours, instead of the destination's. This confuses + * some routers/firewalls, and they drop the packet. So we need to + * build the eth header using the original destination's MAC as the + * source, and send the RST packet directly. + */ + if (oldskb->nf_bridge) { + struct ethhdr *oeth = eth_hdr(oldskb); + nskb->dev = oldskb->nf_bridge->physindev; + niph->tot_len = htons(nskb->len); + ip_send_check(niph); + if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), + oeth->h_source, oeth->h_dest, nskb->len) < 0) + goto free_nskb; + dev_queue_xmit(nskb); + } else +#endif + ip_local_out(nskb); + + return; + + free_nskb: + kfree_skb(nskb); +} +EXPORT_SYMBOL_GPL(nf_send_reset); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index bb1a40db7be1..6af874fc187f 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -40,8 +40,13 @@ config NFT_CHAIN_ROUTE_IPV6 fields such as the source, destination, flowlabel, hop-limit and the packet mark. +config NF_REJECT_IPV6 + tristate "IPv6 packet rejection" + default m if NETFILTER_ADVANCED=n + config NFT_REJECT_IPV6 depends on NF_TABLES_IPV6 + select NF_REJECT_IPV6 default NFT_REJECT tristate @@ -208,6 +213,7 @@ config IP6_NF_FILTER config IP6_NF_TARGET_REJECT tristate "REJECT target support" depends on IP6_NF_FILTER + select NF_REJECT_IPV6 default m if NETFILTER_ADVANCED=n help The REJECT target allows a filtering rule to specify that an ICMPv6 diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 0f7e5b3f328d..fbb25f01143c 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -27,6 +27,9 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o # logging obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o +# reject +obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o + # nf_tables obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c new file mode 100644 index 000000000000..5f5f0438d74d --- /dev/null +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -0,0 +1,163 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include + +void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + struct tcphdr otcph, *tcph; + unsigned int otcplen, hh_len; + int tcphoff, needs_ack; + const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); + struct ipv6hdr *ip6h; +#define DEFAULT_TOS_VALUE 0x0U + const __u8 tclass = DEFAULT_TOS_VALUE; + struct dst_entry *dst = NULL; + u8 proto; + __be16 frag_off; + struct flowi6 fl6; + + if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || + (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { + pr_debug("addr is not unicast.\n"); + return; + } + + proto = oip6h->nexthdr; + tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto, &frag_off); + + if ((tcphoff < 0) || (tcphoff > oldskb->len)) { + pr_debug("Cannot get TCP header.\n"); + return; + } + + otcplen = oldskb->len - tcphoff; + + /* IP header checks: fragment, too short. */ + if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) { + pr_debug("proto(%d) != IPPROTO_TCP, " + "or too short. otcplen = %d\n", + proto, otcplen); + return; + } + + if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) + BUG(); + + /* No RST for RST. */ + if (otcph.rst) { + pr_debug("RST is set\n"); + return; + } + + /* Check checksum. */ + if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) { + pr_debug("TCP checksum is invalid\n"); + return; + } + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = oip6h->daddr; + fl6.daddr = oip6h->saddr; + fl6.fl6_sport = otcph.dest; + fl6.fl6_dport = otcph.source; + security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL || dst->error) { + dst_release(dst); + return; + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + return; + + hh_len = (dst->dev->hard_header_len + 15)&~15; + nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) + + sizeof(struct tcphdr) + dst->trailer_len, + GFP_ATOMIC); + + if (!nskb) { + net_dbg_ratelimited("cannot alloc skb\n"); + dst_release(dst); + return; + } + + skb_dst_set(nskb, dst); + + skb_reserve(nskb, hh_len + dst->header_len); + + skb_put(nskb, sizeof(struct ipv6hdr)); + skb_reset_network_header(nskb); + ip6h = ipv6_hdr(nskb); + ip6_flow_hdr(ip6h, tclass, 0); + ip6h->hop_limit = ip6_dst_hoplimit(dst); + ip6h->nexthdr = IPPROTO_TCP; + ip6h->saddr = oip6h->daddr; + ip6h->daddr = oip6h->saddr; + + skb_reset_transport_header(nskb); + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + /* Truncate to length (no data) */ + tcph->doff = sizeof(struct tcphdr)/4; + tcph->source = otcph.dest; + tcph->dest = otcph.source; + + if (otcph.ack) { + needs_ack = 0; + tcph->seq = otcph.ack_seq; + tcph->ack_seq = 0; + } else { + needs_ack = 1; + tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin + + otcplen - (otcph.doff<<2)); + tcph->seq = 0; + } + + /* Reset flags */ + ((u_int8_t *)tcph)[13] = 0; + tcph->rst = 1; + tcph->ack = needs_ack; + tcph->window = 0; + tcph->urg_ptr = 0; + tcph->check = 0; + + /* Adjust TCP checksum */ + tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, + &ipv6_hdr(nskb)->daddr, + sizeof(struct tcphdr), IPPROTO_TCP, + csum_partial(tcph, + sizeof(struct tcphdr), 0)); + + nf_ct_attach(nskb, oldskb); + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + /* If we use ip6_local_out for bridged traffic, the MAC source on + * the RST will be ours, instead of the destination's. This confuses + * some routers/firewalls, and they drop the packet. So we need to + * build the eth header using the original destination's MAC as the + * source, and send the RST packet directly. + */ + if (oldskb->nf_bridge) { + struct ethhdr *oeth = eth_hdr(oldskb); + nskb->dev = oldskb->nf_bridge->physindev; + nskb->protocol = htons(ETH_P_IPV6); + ip6h->payload_len = htons(sizeof(struct tcphdr)); + if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), + oeth->h_source, oeth->h_dest, nskb->len) < 0) + return; + dev_queue_xmit(nskb); + } else +#endif + ip6_local_out(nskb); +} +EXPORT_SYMBOL_GPL(nf_send_reset6); -- cgit v1.2.3 From 1109a90c01177e8f4a5fd95c5b685ad02f1fe9bb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 1 Oct 2014 11:19:17 +0200 Subject: netfilter: use IS_ENABLED(CONFIG_BRIDGE_NETFILTER) In 34666d4 ("netfilter: bridge: move br_netfilter out of the core"), the bridge netfilter code has been modularized. Use IS_ENABLED instead of ifdef to cover the module case. Fixes: 34666d4 ("netfilter: bridge: move br_netfilter out of the core") Signed-off-by: Pablo Neira Ayuso --- net/core/skbuff.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/netfilter/ipt_REJECT.c | 2 +- net/ipv4/netfilter/nf_defrag_ipv4.c | 2 +- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 2 +- net/netfilter/ipset/ip_set_hash_netiface.c | 4 ++-- net/netfilter/nf_log_common.c | 2 +- net/netfilter/nf_queue.c | 4 ++-- net/netfilter/nfnetlink_log.c | 8 ++++---- net/netfilter/nfnetlink_queue_core.c | 12 ++++++------ 10 files changed, 20 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4be570a4ab21..7de3d679f3e0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -566,7 +566,7 @@ static void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); #endif /* XXX: IS this still necessary? - JHS */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c8fa62476461..e35b71289156 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -516,7 +516,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge) mtu -= nf_bridge_mtu_reduction(skb); #endif diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 5b6e0df4ccff..8f48f5517e33 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -20,7 +20,7 @@ #include #include #include -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include #endif diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 76bd1aef257f..7e5ca6f2d0cd 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -50,7 +50,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge && skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 7b9a748c6bac..e70382e4dfb5 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -40,7 +40,7 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge && skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 03cdb69ac9bf..35dd35873442 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -237,7 +237,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, #define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC) if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const struct nf_bridge_info *nf_bridge = skb->nf_bridge; if (!nf_bridge) @@ -474,7 +474,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, ip6_netmask(&e.ip, e.cidr); if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const struct nf_bridge_info *nf_bridge = skb->nf_bridge; if (!nf_bridge) diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index eeb8ef4ff1a3..a2233e77cf39 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -158,7 +158,7 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, '0' + loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge) { const struct net_device *physindev; const struct net_device *physoutdev; diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 5d24b1fdb593..4c8b68e5fa16 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -52,7 +52,7 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) dev_put(entry->indev); if (entry->outdev) dev_put(entry->outdev); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; @@ -77,7 +77,7 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) dev_hold(entry->indev); if (entry->outdev) dev_hold(entry->outdev); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; struct net_device *physdev; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index a11c5ff2f720..b1e3a0579416 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -36,7 +36,7 @@ #include -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" #endif @@ -429,7 +429,7 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; if (indev) { -#ifndef CONFIG_BRIDGE_NETFILTER +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; @@ -460,7 +460,7 @@ __build_packet_message(struct nfnl_log_net *log, } if (outdev) { -#ifndef CONFIG_BRIDGE_NETFILTER +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; @@ -640,7 +640,7 @@ nfulnl_log_packet(struct net *net, + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 108120f216b1..a82077d9f59b 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -36,7 +36,7 @@ #include -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" #endif @@ -302,7 +302,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif @@ -380,7 +380,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, indev = entry->indev; if (indev) { -#ifndef CONFIG_BRIDGE_NETFILTER +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; #else @@ -410,7 +410,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } if (outdev) { -#ifndef CONFIG_BRIDGE_NETFILTER +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; #else @@ -569,7 +569,7 @@ nf_queue_entry_dup(struct nf_queue_entry *e) return NULL; } -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* When called from bridge netfilter, skb->data must point to MAC header * before calling skb_gso_segment(). Else, original MAC header is lost * and segmented skbs will be sent to wrong destination. @@ -763,7 +763,7 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) if (entry->outdev) if (entry->outdev->ifindex == ifindex) return 1; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { if (entry->skb->nf_bridge->physindev && entry->skb->nf_bridge->physindev->ifindex == ifindex) -- cgit v1.2.3 From 1b1bc49c0fc0501bf0d1366a2a5e5c1f8dcf9cb1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 1 Oct 2014 13:53:20 +0200 Subject: netfilter: nf_tables: wait for call_rcu completion on module removal Make sure the objects have been released before the nf_tables modules is removed. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 19e79f0d9ad2..556a0dfa4abc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4163,6 +4163,7 @@ static void __exit nf_tables_module_exit(void) { unregister_pernet_subsys(&nf_tables_net_ops); nfnetlink_subsys_unregister(&nf_tables_subsys); + rcu_barrier(); nf_tables_core_module_exit(); kfree(info); } -- cgit v1.2.3 From 756c1b1a7f20a42a559b40b3b77db5afcbb719d6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 17 Jun 2014 21:18:44 +0200 Subject: netfilter: nft_compat: remove incomplete 32/64 bits arch compat code This code was based on the wrong asumption that you can probe based on the match/target private size that we get from userspace. This doesn't work at all when you have to dump the info back to userspace since you don't know what word size the userspace utility is using. Currently, the extensions that require arch compat are limit match and the ebt_mark match/target. The standard targets are not used by the nft-xt compat layer, so they are not affected. We can work around this limitation with a new revision that uses arch agnostic types. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 116 ++++++--------------------------------------- 1 file changed, 15 insertions(+), 101 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 1840989092ed..7e2683c8a44a 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -101,26 +101,12 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, static void target_compat_from_user(struct xt_target *t, void *in, void *out) { -#ifdef CONFIG_COMPAT - if (t->compat_from_user) { - int pad; - - t->compat_from_user(out, in); - pad = XT_ALIGN(t->targetsize) - t->targetsize; - if (pad > 0) - memset(out + t->targetsize, 0, pad); - } else -#endif - memcpy(out, in, XT_ALIGN(t->targetsize)); -} + int pad; -static inline int nft_compat_target_offset(struct xt_target *target) -{ -#ifdef CONFIG_COMPAT - return xt_compat_target_offset(target); -#else - return 0; -#endif + memcpy(out, in, t->targetsize); + pad = XT_ALIGN(t->targetsize) - t->targetsize; + if (pad > 0) + memset(out + t->targetsize, 0, pad); } static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = { @@ -208,34 +194,6 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) module_put(target->me); } -static int -target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in) -{ - int ret; - -#ifdef CONFIG_COMPAT - if (t->compat_to_user) { - mm_segment_t old_fs; - void *out; - - out = kmalloc(XT_ALIGN(t->targetsize), GFP_ATOMIC); - if (out == NULL) - return -ENOMEM; - - /* We want to reuse existing compat_to_user */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - t->compat_to_user(out, in); - set_fs(old_fs); - ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), out); - kfree(out); - } else -#endif - ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), in); - - return ret; -} - static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct xt_target *target = expr->ops->data; @@ -243,7 +201,7 @@ static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) || nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) || - target_dump_info(skb, target, info)) + nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(target->targetsize), info)) goto nla_put_failure; return 0; @@ -341,17 +299,12 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, static void match_compat_from_user(struct xt_match *m, void *in, void *out) { -#ifdef CONFIG_COMPAT - if (m->compat_from_user) { - int pad; - - m->compat_from_user(out, in); - pad = XT_ALIGN(m->matchsize) - m->matchsize; - if (pad > 0) - memset(out + m->matchsize, 0, pad); - } else -#endif - memcpy(out, in, XT_ALIGN(m->matchsize)); + int pad; + + memcpy(out, in, m->matchsize); + pad = XT_ALIGN(m->matchsize) - m->matchsize; + if (pad > 0) + memset(out + m->matchsize, 0, pad); } static int @@ -404,43 +357,6 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) module_put(match->me); } -static int -match_dump_info(struct sk_buff *skb, const struct xt_match *m, const void *in) -{ - int ret; - -#ifdef CONFIG_COMPAT - if (m->compat_to_user) { - mm_segment_t old_fs; - void *out; - - out = kmalloc(XT_ALIGN(m->matchsize), GFP_ATOMIC); - if (out == NULL) - return -ENOMEM; - - /* We want to reuse existing compat_to_user */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - m->compat_to_user(out, in); - set_fs(old_fs); - ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), out); - kfree(out); - } else -#endif - ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), in); - - return ret; -} - -static inline int nft_compat_match_offset(struct xt_match *match) -{ -#ifdef CONFIG_COMPAT - return xt_compat_match_offset(match); -#else - return 0; -#endif -} - static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) { void *info = nft_expr_priv(expr); @@ -448,7 +364,7 @@ static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) || nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) || - match_dump_info(skb, match, info)) + nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(match->matchsize), info)) goto nla_put_failure; return 0; @@ -643,8 +559,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-ENOMEM); nft_match->ops.type = &nft_match_type; - nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize) + - nft_compat_match_offset(match)); + nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize)); nft_match->ops.eval = nft_match_eval; nft_match->ops.init = nft_match_init; nft_match->ops.destroy = nft_match_destroy; @@ -714,8 +629,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-ENOMEM); nft_target->ops.type = &nft_target_type; - nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize) + - nft_compat_target_offset(target)); + nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); nft_target->ops.eval = nft_target_eval; nft_target->ops.init = nft_target_init; nft_target->ops.destroy = nft_target_destroy; -- cgit v1.2.3 From 36d2af5998258344993dd43729997a7a3baa9d99 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 1 Oct 2014 20:34:37 +0200 Subject: netfilter: nf_tables: allow to filter from prerouting and postrouting This allows us to emulate the NAT table in ebtables, which is actually a plain filter chain that hooks at prerouting, output and postrouting. Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nf_tables_bridge.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 5bcc0d8b31f2..da17a5eab8b4 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -34,9 +34,11 @@ static struct nft_af_info nft_af_bridge __read_mostly = { .owner = THIS_MODULE, .nops = 1, .hooks = { + [NF_BR_PRE_ROUTING] = nft_do_chain_bridge, [NF_BR_LOCAL_IN] = nft_do_chain_bridge, [NF_BR_FORWARD] = nft_do_chain_bridge, [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, + [NF_BR_POST_ROUTING] = nft_do_chain_bridge, }, }; -- cgit v1.2.3 From 4b7fd5d97ee6e599247b4a55122ca6ba80c8148d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 2 Oct 2014 11:13:21 +0200 Subject: netfilter: explicit module dependency between br_netfilter and physdev You can use physdev to match the physical interface enslaved to the bridge device. This information is stored in skb->nf_bridge and it is set up by br_netfilter. So, this is only available when iptables is used from the bridge netfilter path. Since 34666d4 ("netfilter: bridge: move br_netfilter out of the core"), the br_netfilter code is modular. To reduce the impact of this change, we can autoload the br_netfilter if the physdev match is used since we assume that the users need br_netfilter in place. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/br_netfilter.h | 6 ++++++ net/bridge/br_netfilter.c | 5 +++++ net/netfilter/xt_physdev.c | 3 +++ 3 files changed, 14 insertions(+) create mode 100644 include/net/netfilter/br_netfilter.h (limited to 'net') diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h new file mode 100644 index 000000000000..2aa6048a55c1 --- /dev/null +++ b/include/net/netfilter/br_netfilter.h @@ -0,0 +1,6 @@ +#ifndef _BR_NETFILTER_H_ +#define _BR_NETFILTER_H_ + +void br_netfilter_enable(void); + +#endif /* _BR_NETFILTER_H_ */ diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 97e43937aaca..fa1270cc5086 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -856,6 +856,11 @@ static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, return NF_ACCEPT; } +void br_netfilter_enable(void) +{ +} +EXPORT_SYMBOL_GPL(br_netfilter_enable); + /* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because * br_dev_queue_push_xmit is called afterwards */ static struct nf_hook_ops br_nf_ops[] __read_mostly = { diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index d7ca16b8b8df..f440f57a452f 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -13,6 +13,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Bart De Schuymer "); @@ -87,6 +88,8 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) { const struct xt_physdev_info *info = par->matchinfo; + br_netfilter_enable(); + if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || info->bitmask & ~XT_PHYSDEV_OP_MASK) return -EINVAL; -- cgit v1.2.3 From 8da4cc1b10c1aeba090d1d862b17174e4dbd50a4 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Fri, 3 Oct 2014 14:13:36 +0200 Subject: netfilter: nft_masq: register/unregister notifiers on module init/exit We have to register the notifiers in the masquerade expression from the the module _init and _exit path. This fixes crashes when removing the masquerade rule with no ipt_MASQUERADE support in place (which was masking the problem). Fixes: 9ba1f72 ("netfilter: nf_tables: add new nft_masq expression") Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_masq_ipv4.c | 34 +++++++++++----------------------- net/ipv6/netfilter/nft_masq_ipv6.c | 34 +++++++++++----------------------- 2 files changed, 22 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index 6ea1d207b6a5..1c636d6b5b50 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -32,33 +32,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, data[NFT_REG_VERDICT].verdict = verdict; } -static int nft_masq_ipv4_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - int err; - - err = nft_masq_init(ctx, expr, tb); - if (err < 0) - return err; - - nf_nat_masquerade_ipv4_register_notifier(); - return 0; -} - -static void nft_masq_ipv4_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) -{ - nf_nat_masquerade_ipv4_unregister_notifier(); -} - static struct nft_expr_type nft_masq_ipv4_type; static const struct nft_expr_ops nft_masq_ipv4_ops = { .type = &nft_masq_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), .eval = nft_masq_ipv4_eval, - .init = nft_masq_ipv4_init, - .destroy = nft_masq_ipv4_destroy, + .init = nft_masq_init, .dump = nft_masq_dump, }; @@ -73,12 +52,21 @@ static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { static int __init nft_masq_ipv4_module_init(void) { - return nft_register_expr(&nft_masq_ipv4_type); + int ret; + + ret = nft_register_expr(&nft_masq_ipv4_type); + if (ret < 0) + return ret; + + nf_nat_masquerade_ipv4_register_notifier(); + + return ret; } static void __exit nft_masq_ipv4_module_exit(void) { nft_unregister_expr(&nft_masq_ipv4_type); + nf_nat_masquerade_ipv4_unregister_notifier(); } module_init(nft_masq_ipv4_module_init); diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 4e51334ef6b7..556262f40761 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -32,33 +32,12 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, data[NFT_REG_VERDICT].verdict = verdict; } -static int nft_masq_ipv6_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - int err; - - err = nft_masq_init(ctx, expr, tb); - if (err < 0) - return err; - - nf_nat_masquerade_ipv6_register_notifier(); - return 0; -} - -static void nft_masq_ipv6_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) -{ - nf_nat_masquerade_ipv6_unregister_notifier(); -} - static struct nft_expr_type nft_masq_ipv6_type; static const struct nft_expr_ops nft_masq_ipv6_ops = { .type = &nft_masq_ipv6_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), .eval = nft_masq_ipv6_eval, - .init = nft_masq_ipv6_init, - .destroy = nft_masq_ipv6_destroy, + .init = nft_masq_init, .dump = nft_masq_dump, }; @@ -73,12 +52,21 @@ static struct nft_expr_type nft_masq_ipv6_type __read_mostly = { static int __init nft_masq_ipv6_module_init(void) { - return nft_register_expr(&nft_masq_ipv6_type); + int ret; + + ret = nft_register_expr(&nft_masq_ipv6_type); + if (ret < 0) + return ret; + + nf_nat_masquerade_ipv6_register_notifier(); + + return ret; } static void __exit nft_masq_ipv6_module_exit(void) { nft_unregister_expr(&nft_masq_ipv6_type); + nf_nat_masquerade_ipv6_unregister_notifier(); } module_init(nft_masq_ipv6_module_init); -- cgit v1.2.3 From 5772e9a3463b264cee5a4e73ef586ad482d7ba48 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 1 Oct 2014 22:35:59 +0200 Subject: qdisc: bulk dequeue support for qdiscs with TCQ_F_ONETXQUEUE Based on DaveM's recent API work on dev_hard_start_xmit(), that allows sending/processing an entire skb list. This patch implements qdisc bulk dequeue, by allowing multiple packets to be dequeued in dequeue_skb(). The optimization principle for this is two fold, (1) to amortize locking cost and (2) avoid expensive tailptr update for notifying HW. (1) Several packets are dequeued while holding the qdisc root_lock, amortizing locking cost over several packet. The dequeued SKB list is processed under the TXQ lock in dev_hard_start_xmit(), thus also amortizing the cost of the TXQ lock. (2) Further more, dev_hard_start_xmit() will utilize the skb->xmit_more API to delay HW tailptr update, which also reduces the cost per packet. One restriction of the new API is that every SKB must belong to the same TXQ. This patch takes the easy way out, by restricting bulk dequeue to qdisc's with the TCQ_F_ONETXQUEUE flag, that specifies the qdisc only have attached a single TXQ. Some detail about the flow; dev_hard_start_xmit() will process the skb list, and transmit packets individually towards the driver (see xmit_one()). In case the driver stops midway in the list, the remaining skb list is returned by dev_hard_start_xmit(). In sch_direct_xmit() this returned list is requeued by dev_requeue_skb(). To avoid overshooting the HW limits, which results in requeuing, the patch limits the amount of bytes dequeued, based on the drivers BQL limits. In-effect bulking will only happen for BQL enabled drivers. Small amounts for extra HoL blocking (2x MTU/0.24ms) were measured at 100Mbit/s, with bulking 8 packets, but the oscillating nature of the measurement indicate something, like sched latency might be causing this effect. More comparisons show, that this oscillation goes away occationally. Thus, we disregard this artifact completely and remove any "magic" bulking limit. For now, as a conservative approach, stop bulking when seeing TSO and segmented GSO packets. They already benefit from bulking on their own. A followup patch add this, to allow easier bisect-ability for finding regressions. Jointed work with Hannes, Daniel and Florian. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Hannes Frederic Sowa Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/sch_generic.h | 16 ++++++++++++++++ net/sched/sch_generic.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f12669819d1a..d17ed6fb2f70 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -119,6 +120,21 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) qdisc->__state &= ~__QDISC___STATE_RUNNING; } +static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) +{ + return qdisc->flags & TCQ_F_ONETXQUEUE; +} + +static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq) +{ +#ifdef CONFIG_BQL + /* Non-BQL migrated drivers will return 0, too. */ + return dql_avail(&txq->dql); +#else + return 0; +#endif +} + static inline bool qdisc_is_throttled(const struct Qdisc *qdisc) { return test_bit(__QDISC_STATE_THROTTLED, &qdisc->state) ? true : false; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7c8e5d73d433..c2e87e63b832 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -56,6 +56,41 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) return 0; } +static struct sk_buff *try_bulk_dequeue_skb(struct Qdisc *q, + struct sk_buff *head_skb, + int bytelimit) +{ + struct sk_buff *skb, *tail_skb = head_skb; + + while (bytelimit > 0) { + /* For now, don't bulk dequeue GSO (or GSO segmented) pkts */ + if (tail_skb->next || skb_is_gso(tail_skb)) + break; + + skb = q->dequeue(q); + if (!skb) + break; + + bytelimit -= skb->len; /* covers GSO len */ + skb = validate_xmit_skb(skb, qdisc_dev(q)); + if (!skb) + break; + + /* "skb" can be a skb list after validate call above + * (GSO segmented), but it is okay to append it to + * current tail_skb->next, because next round will exit + * in-case "tail_skb->next" is a skb list. + */ + tail_skb->next = skb; + tail_skb = skb; + } + + return head_skb; +} + +/* Note that dequeue_skb can possibly return a SKB list (via skb->next). + * A requeued skb (via q->gso_skb) can also be a SKB list. + */ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) { struct sk_buff *skb = q->gso_skb; @@ -70,10 +105,17 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) } else skb = NULL; } else { - if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) { + if (!(q->flags & TCQ_F_ONETXQUEUE) || + !netif_xmit_frozen_or_stopped(txq)) { + int bytelimit = qdisc_avail_bulklimit(txq); + skb = q->dequeue(q); - if (skb) + if (skb) { + bytelimit -= skb->len; skb = validate_xmit_skb(skb, qdisc_dev(q)); + } + if (skb && qdisc_may_bulk(q)) + skb = try_bulk_dequeue_skb(q, skb, bytelimit); } } -- cgit v1.2.3 From 808e7ac0bdef31204184904f6b3ea356a30a9ed5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 1 Oct 2014 22:36:09 +0200 Subject: qdisc: dequeue bulking also pickup GSO/TSO packets The TSO and GSO segmented packets already benefit from bulking on their own. The TSO packets have always taken advantage of the only updating the tailptr once for a large packet. The GSO segmented packets have recently taken advantage of bulking xmit_more API, via merge commit 53fda7f7f9e8 ("Merge branch 'xmit_list'"), specifically via commit 7f2e870f2a4 ("net: Move main gso loop out of dev_hard_start_xmit() into helper.") allowing qdisc requeue of remaining list. And via commit ce93718fb7cd ("net: Don't keep around original SKB when we software segment GSO frames."). This patch allow further bulking of TSO/GSO packets together, when dequeueing from the qdisc. Testing: Measuring HoL (Head-of-Line) blocking for TSO and GSO, with netperf-wrapper. Bulking several TSO show no performance regressions (requeues were in the area 32 requeues/sec). Bulking several GSOs does show small regression or very small improvement (requeues were in the area 8000 requeues/sec). Using ixgbe 10Gbit/s with GSO bulking, we can measure some additional latency. Base-case, which is "normal" GSO bulking, sees varying high-prio queue delay between 0.38ms to 0.47ms. Bulking several GSOs together, result in a stable high-prio queue delay of 0.50ms. Using igb at 100Mbit/s with GSO bulking, shows an improvement. Base-case sees varying high-prio queue delay between 2.23ms to 2.35ms Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index c2e87e63b832..797ebef73642 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -63,10 +63,6 @@ static struct sk_buff *try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, *tail_skb = head_skb; while (bytelimit > 0) { - /* For now, don't bulk dequeue GSO (or GSO segmented) pkts */ - if (tail_skb->next || skb_is_gso(tail_skb)) - break; - skb = q->dequeue(q); if (!skb) break; @@ -76,11 +72,9 @@ static struct sk_buff *try_bulk_dequeue_skb(struct Qdisc *q, if (!skb) break; - /* "skb" can be a skb list after validate call above - * (GSO segmented), but it is okay to append it to - * current tail_skb->next, because next round will exit - * in-case "tail_skb->next" is a skb list. - */ + while (tail_skb->next) /* GSO list goto tail */ + tail_skb = tail_skb->next; + tail_skb->next = skb; tail_skb = skb; } -- cgit v1.2.3 From 310886dd5fa3606d9325b10caf7c8ba5e9f9ab03 Mon Sep 17 00:00:00 2001 From: "Herton R. Krzesinski" Date: Wed, 1 Oct 2014 18:49:52 -0300 Subject: net/rds: call rds_conn_drop instead of open code it at rds_connect_complete Signed-off-by: Herton R. Krzesinski Signed-off-by: David S. Miller --- net/rds/threads.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/threads.c b/net/rds/threads.c index 65eaefcab241..dc2402e871fd 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -78,8 +78,7 @@ void rds_connect_complete(struct rds_connection *conn) "current state is %d\n", __func__, atomic_read(&conn->c_state)); - atomic_set(&conn->c_state, RDS_CONN_ERROR); - queue_work(rds_wq, &conn->c_down_w); + rds_conn_drop(conn); return; } -- cgit v1.2.3 From eb74cc97b830c1e438dc1d6b049f17bdb2b9aae5 Mon Sep 17 00:00:00 2001 From: "Herton R. Krzesinski" Date: Wed, 1 Oct 2014 18:49:53 -0300 Subject: net/rds: do proper house keeping if connection fails in rds_tcp_conn_connect I see two problems if we consider the sock->ops->connect attempt to fail in rds_tcp_conn_connect. The first issue is that for example we don't remove the previously added rds_tcp_connection item to rds_tcp_tc_list at rds_tcp_set_callbacks, which means that on a next reconnect attempt for the same rds_connection, when rds_tcp_conn_connect is called we can again call rds_tcp_set_callbacks, resulting in duplicated items on rds_tcp_tc_list, leading to list corruption: to avoid this just make sure we call properly rds_tcp_restore_callbacks before we exit. The second issue is that we should also release the sock properly, by setting sock = NULL only if we are returning without error. Signed-off-by: Herton R. Krzesinski Signed-off-by: David S. Miller --- net/rds/tcp_connect.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index a65ee78db0c5..f9f564a6c960 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -106,11 +106,14 @@ int rds_tcp_conn_connect(struct rds_connection *conn) rds_tcp_set_callbacks(sock, conn); ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), O_NONBLOCK); - sock = NULL; rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; + if (ret == 0) + sock = NULL; + else + rds_tcp_restore_callbacks(sock, conn->c_transport_data); out: if (sock) -- cgit v1.2.3 From 593cbb3ec6a3f2424966832727f394b1696d0d72 Mon Sep 17 00:00:00 2001 From: "Herton R. Krzesinski" Date: Wed, 1 Oct 2014 18:49:54 -0300 Subject: net/rds: fix possible double free on sock tear down I got a report of a double free happening at RDS slab cache. One suspicion was that may be somewhere we were doing a sock_hold/sock_put on an already freed sock. Thus after providing a kernel with the following change: static inline void sock_hold(struct sock *sk) { - atomic_inc(&sk->sk_refcnt); + if (!atomic_inc_not_zero(&sk->sk_refcnt)) + WARN(1, "Trying to hold sock already gone: %p (family: %hd)\n", + sk, sk->sk_family); } The warning successfuly triggered: Trying to hold sock already gone: ffff81f6dda61280 (family: 21) WARNING: at include/net/sock.h:350 sock_hold() Call Trace: [] :rds:rds_send_remove_from_sock+0xf0/0x21b [] :rds:rds_send_drop_acked+0xbf/0xcf [] :rds_rdma:rds_ib_recv_tasklet_fn+0x256/0x2dc [] tasklet_action+0x8f/0x12b [] __do_softirq+0x89/0x133 [] call_softirq+0x1c/0x28 [] do_softirq+0x2c/0x7d [] do_IRQ+0xee/0xf7 [] ret_from_intr+0x0/0xa Looking at the call chain above, the only way I think this would be possible is if somewhere we already released the same socket->sock which is assigned to the rds_message at rds_send_remove_from_sock. Which seems only possible to happen after the tear down done on rds_release. rds_release properly calls rds_send_drop_to to drop the socket from any rds_message, and some proper synchronization is in place to avoid race with rds_send_drop_acked/rds_send_remove_from_sock. However, I still see a very narrow window where it may be possible we touch a sock already released: when rds_release races with rds_send_drop_acked, we check RDS_MSG_ON_CONN to avoid cleanup on the same rds_message, but in this specific case we don't clear rm->m_rs. In this case, it seems we could then go on at rds_send_drop_to and after it returns, the sock is freed by last sock_put on rds_release, with concurrently we being at rds_send_remove_from_sock; then at some point in the loop at rds_send_remove_from_sock we process an rds_message which didn't have rm->m_rs unset for a freed sock, and a possible sock_hold on an sock already gone at rds_release happens. This hopefully address the described condition above and avoids a double free on "second last" sock_put. In addition, I removed the comment about socket destruction on top of rds_send_drop_acked: we call rds_send_drop_to in rds_release and we should have things properly serialized there, thus I can't see the comment being accurate there. Signed-off-by: Herton R. Krzesinski Signed-off-by: David S. Miller --- net/rds/send.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 23718160d71e..0a64541020b0 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -593,8 +593,11 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status) sock_put(rds_rs_to_sk(rs)); } rs = rm->m_rs; - sock_hold(rds_rs_to_sk(rs)); + if (rs) + sock_hold(rds_rs_to_sk(rs)); } + if (!rs) + goto unlock_and_drop; spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { @@ -638,9 +641,6 @@ unlock_and_drop: * queue. This means that in the TCP case, the message may not have been * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked * checks the RDS_MSG_HAS_ACK_SEQ bit. - * - * XXX It's not clear to me how this is safely serialized with socket - * destruction. Maybe it should bail if it sees SOCK_DEAD. */ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked) @@ -711,6 +711,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) */ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { spin_unlock_irqrestore(&conn->c_lock, flags); + spin_lock_irqsave(&rm->m_rs_lock, flags); + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); continue; } list_del_init(&rm->m_conn_item); -- cgit v1.2.3 From 55a93b3ea780908b7d1b3a8cf1976223a9268d78 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Oct 2014 15:31:07 -0700 Subject: qdisc: validate skb without holding lock Validation of skb can be pretty expensive : GSO segmentation and/or checksum computations. We can do this without holding qdisc lock, so that other cpus can queue additional packets. Trick is that requeued packets were already validated, so we carry a boolean so that sch_direct_xmit() can validate a fresh skb list, or directly use an old one. Tested on 40Gb NIC (8 TX queues) and 200 concurrent flows, 48 threads host. Turning TSO on or off had no effect on throughput, only few more cpu cycles. Lock contention on qdisc lock disappeared. Same if disabling TX checksum offload. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- include/net/pkt_sched.h | 2 +- net/core/dev.c | 29 +++++++++++++++++++--- net/sched/sch_generic.c | 61 ++++++++++++++++++++++------------------------- 4 files changed, 56 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9b7fbacb6296..910fb17ad148 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2821,7 +2821,7 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); -struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 8bbe626e9ece..e4b3c828c1c2 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -99,7 +99,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock); + spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); diff --git a/net/core/dev.c b/net/core/dev.c index e55c546717d4..1a90530f83ff 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2655,7 +2655,7 @@ struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t featur return skb; } -struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -2720,6 +2720,30 @@ out_null: return NULL; } +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) +{ + struct sk_buff *next, *head = NULL, *tail; + + while (skb) { + next = skb->next; + skb->next = NULL; + skb = validate_xmit_skb(skb, dev); + if (skb) { + struct sk_buff *end = skb; + + while (end->next) + end = end->next; + if (!head) + head = skb; + else + tail->next = skb; + tail = end; + } + skb = next; + } + return head; +} + static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -2786,8 +2810,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_bstats_update(q, skb); - skb = validate_xmit_skb(skb, dev); - if (skb && sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 797ebef73642..2b349a4de3c8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -56,40 +56,34 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) return 0; } -static struct sk_buff *try_bulk_dequeue_skb(struct Qdisc *q, - struct sk_buff *head_skb, - int bytelimit) +static void try_bulk_dequeue_skb(struct Qdisc *q, + struct sk_buff *skb, + const struct netdev_queue *txq) { - struct sk_buff *skb, *tail_skb = head_skb; + int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; while (bytelimit > 0) { - skb = q->dequeue(q); - if (!skb) - break; + struct sk_buff *nskb = q->dequeue(q); - bytelimit -= skb->len; /* covers GSO len */ - skb = validate_xmit_skb(skb, qdisc_dev(q)); - if (!skb) + if (!nskb) break; - while (tail_skb->next) /* GSO list goto tail */ - tail_skb = tail_skb->next; - - tail_skb->next = skb; - tail_skb = skb; + bytelimit -= nskb->len; /* covers GSO len */ + skb->next = nskb; + skb = nskb; } - - return head_skb; + skb->next = NULL; } /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ -static inline struct sk_buff *dequeue_skb(struct Qdisc *q) +static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate) { struct sk_buff *skb = q->gso_skb; const struct netdev_queue *txq = q->dev_queue; + *validate = true; if (unlikely(skb)) { /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); @@ -98,21 +92,16 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) q->q.qlen--; } else skb = NULL; + /* skb in gso_skb were already validated */ + *validate = false; } else { if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) { - int bytelimit = qdisc_avail_bulklimit(txq); - skb = q->dequeue(q); - if (skb) { - bytelimit -= skb->len; - skb = validate_xmit_skb(skb, qdisc_dev(q)); - } if (skb && qdisc_may_bulk(q)) - skb = try_bulk_dequeue_skb(q, skb, bytelimit); + try_bulk_dequeue_skb(q, skb, txq); } } - return skb; } @@ -156,19 +145,24 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, */ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock) + spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; /* And release qdisc */ spin_unlock(root_lock); - HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_stopped(txq)) - skb = dev_hard_start_xmit(skb, dev, txq, &ret); + /* Note that we validate skb (GSO, checksum, ...) outside of locks */ + if (validate) + skb = validate_xmit_skb_list(skb, dev); - HARD_TX_UNLOCK(dev, txq); + if (skb) { + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_stopped(txq)) + skb = dev_hard_start_xmit(skb, dev, txq, &ret); + HARD_TX_UNLOCK(dev, txq); + } spin_lock(root_lock); if (dev_xmit_complete(ret)) { @@ -217,9 +211,10 @@ static inline int qdisc_restart(struct Qdisc *q) struct net_device *dev; spinlock_t *root_lock; struct sk_buff *skb; + bool validate; /* Dequeue packet */ - skb = dequeue_skb(q); + skb = dequeue_skb(q, &validate); if (unlikely(!skb)) return 0; @@ -229,7 +224,7 @@ static inline int qdisc_restart(struct Qdisc *q) dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); - return sch_direct_xmit(skb, q, dev, txq, root_lock); + return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } void __qdisc_run(struct Qdisc *q) -- cgit v1.2.3 From 01291202ed4ad548f9a7147d20425cb1d24f49a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Oct 2014 07:38:46 -0700 Subject: net: do not export skb_gro_receive() skb_gro_receive() is only called from tcp_gro_receive() which is not in a module. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fe37309b6667..a0b312fa3047 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3244,7 +3244,6 @@ done: NAPI_GRO_CB(skb)->same_flow = 1; return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive); void __init skb_init(void) { -- cgit v1.2.3 From 7371e0221c7721a1486fef745abaa8ae84571621 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 3 Oct 2014 15:48:07 -0700 Subject: ip_tunnel: Account for secondary encapsulation header in max_headroom When adjusting max_header for the tunnel interface based on egress device we need to account for any extra bytes in secondary encapsulation (e.g. FOU). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 2272de90c2d4..d9c9dc4ffeaf 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -759,7 +759,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, df |= (inner_iph->frag_off&htons(IP_DF)); max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) - + rt->dst.header_len; + + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (max_headroom > dev->needed_headroom) dev->needed_headroom = max_headroom; -- cgit v1.2.3 From efc98d08e1ec4fd131f794370b274dceaf32c958 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 3 Oct 2014 15:48:08 -0700 Subject: fou: eliminate IPv4,v6 specific GRO functions This patch removes fou[46]_gro_receive and fou[46]_gro_complete functions. The v4 or v6 variants were chosen for the UDP offloads based on the address family of the socket this is not necessary or correct. Alternatively, this patch adds is_ipv6 to napi_gro_skb. This is set in udp6_gro_receive and unset in udp4_gro_receive. In fou_gro_receive the value is used to select the correct inet_offloads for the protocol of the outer IP header. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/ipv4/fou.c | 48 ++++++++--------------------------------------- net/ipv4/udp_offload.c | 1 + net/ipv6/udp_offload.c | 1 + 4 files changed, 13 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 910fb17ad148..22d54b9b700d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1886,6 +1886,9 @@ struct napi_gro_cb { /* Number of checksums via CHECKSUM_UNNECESSARY */ u8 csum_cnt:3; + /* Used in foo-over-udp, set in udp[46]_gro_receive */ + u8 is_ipv6:1; + /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index dced89fbe480..7e2126a31f2e 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -65,14 +65,15 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) } static struct sk_buff **fou_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - const struct net_offload **offloads) + struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff **pp = NULL; u8 proto = NAPI_GRO_CB(skb)->proto; + const struct net_offload **offloads; rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out_unlock; @@ -85,14 +86,15 @@ out_unlock: return pp; } -static int fou_gro_complete(struct sk_buff *skb, int nhoff, - const struct net_offload **offloads) +static int fou_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; u8 proto = NAPI_GRO_CB(skb)->proto; int err = -ENOSYS; + const struct net_offload **offloads; rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; @@ -105,28 +107,6 @@ out_unlock: return err; } -static struct sk_buff **fou4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) -{ - return fou_gro_receive(head, skb, inet_offloads); -} - -static int fou4_gro_complete(struct sk_buff *skb, int nhoff) -{ - return fou_gro_complete(skb, nhoff, inet_offloads); -} - -static struct sk_buff **fou6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) -{ - return fou_gro_receive(head, skb, inet6_offloads); -} - -static int fou6_gro_complete(struct sk_buff *skb, int nhoff) -{ - return fou_gro_complete(skb, nhoff, inet6_offloads); -} - static int fou_add_to_port_list(struct fou *fou) { struct fou *fout; @@ -199,20 +179,8 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; - switch (cfg->udp_config.family) { - case AF_INET: - fou->udp_offloads.callbacks.gro_receive = fou4_gro_receive; - fou->udp_offloads.callbacks.gro_complete = fou4_gro_complete; - break; - case AF_INET6: - fou->udp_offloads.callbacks.gro_receive = fou6_gro_receive; - fou->udp_offloads.callbacks.gro_complete = fou6_gro_complete; - break; - default: - err = -EPFNOSUPPORT; - goto error; - } - + fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; fou->udp_offloads.port = cfg->udp_config.local_udp_port; fou->udp_offloads.ipproto = cfg->protocol; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 8c35f2c939ee..507310ef4b56 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -334,6 +334,7 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head, skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, inet_gro_compute_pseudo); skip: + NAPI_GRO_CB(skb)->is_ipv6 = 0; return udp_gro_receive(head, skb, uh); flush: diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 8f96988c1db2..6b8f543f6ac6 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -140,6 +140,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head, ip6_gro_compute_pseudo); skip: + NAPI_GRO_CB(skb)->is_ipv6 = 1; return udp_gro_receive(head, skb, uh); flush: -- cgit v1.2.3 From 37dd0247797b168ad1cc7f5dbec825a1ee66535b Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 3 Oct 2014 15:48:09 -0700 Subject: gue: Receive side for Generic UDP Encapsulation This patch adds support receiving for GUE packets in the fou module. The fou module now supports direct foo-over-udp (no encapsulation header) and GUE. To support this a type parameter is added to the fou netlink parameters. For a GUE socket we define gue_udp_recv, gue_gro_receive, and gue_gro_complete to handle the specifics of the GUE protocol. Most of the code to manage and configure sockets is common with the fou. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/gue.h | 23 ++++++ include/uapi/linux/fou.h | 7 ++ net/ipv4/fou.c | 196 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 217 insertions(+), 9 deletions(-) create mode 100644 include/net/gue.h (limited to 'net') diff --git a/include/net/gue.h b/include/net/gue.h new file mode 100644 index 000000000000..b6c332788084 --- /dev/null +++ b/include/net/gue.h @@ -0,0 +1,23 @@ +#ifndef __NET_GUE_H +#define __NET_GUE_H + +struct guehdr { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 hlen:4, + version:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 version:4, + hlen:4; +#else +#error "Please fix " +#endif + __u8 next_hdr; + __u16 flags; + }; + __u32 word; + }; +}; + +#endif diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h index e03376de453d..8df06894da23 100644 --- a/include/uapi/linux/fou.h +++ b/include/uapi/linux/fou.h @@ -13,6 +13,7 @@ enum { FOU_ATTR_PORT, /* u16 */ FOU_ATTR_AF, /* u8 */ FOU_ATTR_IPPROTO, /* u8 */ + FOU_ATTR_TYPE, /* u8 */ __FOU_ATTR_MAX, }; @@ -27,6 +28,12 @@ enum { __FOU_CMD_MAX, }; +enum { + FOU_ENCAP_UNSPEC, + FOU_ENCAP_DIRECT, + FOU_ENCAP_GUE, +}; + #define FOU_CMD_MAX (__FOU_CMD_MAX - 1) #endif /* _UAPI_LINUX_FOU_H */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 7e2126a31f2e..efa70ad44906 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ struct fou { }; struct fou_cfg { + u16 type; u8 protocol; struct udp_port_cfg udp_config; }; @@ -64,6 +66,41 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) sizeof(struct udphdr)); } +static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) +{ + struct fou *fou = fou_from_sock(sk); + size_t len; + struct guehdr *guehdr; + struct udphdr *uh; + + if (!fou) + return 1; + + len = sizeof(struct udphdr) + sizeof(struct guehdr); + if (!pskb_may_pull(skb, len)) + goto drop; + + uh = udp_hdr(skb); + guehdr = (struct guehdr *)&uh[1]; + + len += guehdr->hlen << 2; + if (!pskb_may_pull(skb, len)) + goto drop; + + if (guehdr->version != 0) + goto drop; + + if (guehdr->flags) { + /* No support yet */ + goto drop; + } + + return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len); +drop: + kfree_skb(skb); + return 0; +} + static struct sk_buff **fou_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -107,6 +144,112 @@ out_unlock: return err; } +static struct sk_buff **gue_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff **pp = NULL; + struct sk_buff *p; + u8 proto; + struct guehdr *guehdr; + unsigned int hlen, guehlen; + unsigned int off; + int flush = 1; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*guehdr); + guehdr = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + guehdr = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!guehdr)) + goto out; + } + + proto = guehdr->next_hdr; + + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_receive)) + goto out_unlock; + + guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + + hlen = off + guehlen; + if (skb_gro_header_hard(skb, hlen)) { + guehdr = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!guehdr)) + goto out_unlock; + } + + flush = 0; + + for (p = *head; p; p = p->next) { + const struct guehdr *guehdr2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + guehdr2 = (struct guehdr *)(p->data + off); + + /* Compare base GUE header to be equal (covers + * hlen, version, next_hdr, and flags. + */ + if (guehdr->word != guehdr2->word) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + /* Compare optional fields are the same. */ + if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1], + guehdr->hlen << 2)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + skb_gro_pull(skb, guehlen); + + /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ + skb_gro_postpull_rcsum(skb, guehdr, guehlen); + + pp = ops->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int gue_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct net_offload **offloads; + struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); + const struct net_offload *ops; + unsigned int guehlen; + u8 proto; + int err = -ENOENT; + + proto = guehdr->next_hdr; + + guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + goto out_unlock; + + err = ops->callbacks.gro_complete(skb, nhoff + guehlen); + +out_unlock: + rcu_read_unlock(); + return err; +} + static int fou_add_to_port_list(struct fou *fou) { struct fou *fout; @@ -142,6 +285,28 @@ static void fou_release(struct fou *fou) kfree(fou); } +static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) +{ + udp_sk(sk)->encap_rcv = fou_udp_recv; + fou->protocol = cfg->protocol; + fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; + fou->udp_offloads.port = cfg->udp_config.local_udp_port; + fou->udp_offloads.ipproto = cfg->protocol; + + return 0; +} + +static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) +{ + udp_sk(sk)->encap_rcv = gue_udp_recv; + fou->udp_offloads.callbacks.gro_receive = gue_gro_receive; + fou->udp_offloads.callbacks.gro_complete = gue_gro_complete; + fou->udp_offloads.port = cfg->udp_config.local_udp_port; + + return 0; +} + static int fou_create(struct net *net, struct fou_cfg *cfg, struct socket **sockp) { @@ -164,10 +329,24 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk = sock->sk; - /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ - fou->protocol = cfg->protocol; - fou->port = cfg->udp_config.local_udp_port; - udp_sk(sk)->encap_rcv = fou_udp_recv; + fou->port = cfg->udp_config.local_udp_port; + + /* Initial for fou type */ + switch (cfg->type) { + case FOU_ENCAP_DIRECT: + err = fou_encap_init(sk, fou, cfg); + if (err) + goto error; + break; + case FOU_ENCAP_GUE: + err = gue_encap_init(sk, fou, cfg); + if (err) + goto error; + break; + default: + err = -EINVAL; + goto error; + } udp_sk(sk)->encap_type = 1; udp_encap_enable(); @@ -179,11 +358,6 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; - fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; - fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; - fou->udp_offloads.port = cfg->udp_config.local_udp_port; - fou->udp_offloads.ipproto = cfg->protocol; - if (cfg->udp_config.family == AF_INET) { err = udp_add_offload(&fou->udp_offloads); if (err) @@ -240,6 +414,7 @@ static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { [FOU_ATTR_PORT] = { .type = NLA_U16, }, [FOU_ATTR_AF] = { .type = NLA_U8, }, [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, + [FOU_ATTR_TYPE] = { .type = NLA_U8, }, }; static int parse_nl_config(struct genl_info *info, @@ -267,6 +442,9 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_IPPROTO]) cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); + if (info->attrs[FOU_ATTR_TYPE]) + cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]); + return 0; } -- cgit v1.2.3 From bc1fc390e1728672b5b343b85185fcc1fe41043b Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 3 Oct 2014 15:48:10 -0700 Subject: ip_tunnel: Add GUE support This patch allows configuring IPIP, sit, and GRE tunnels to use GUE. This is very similar to fou excpet that we need to insert the GUE header in addition to the UDP header on transmit. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/if_tunnel.h | 1 + net/ipv4/ip_tunnel.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 7c832afdfa94..280d9e092283 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -64,6 +64,7 @@ enum { enum tunnel_encap_types { TUNNEL_ENCAP_NONE, TUNNEL_ENCAP_FOU, + TUNNEL_ENCAP_GUE, }; #define TUNNEL_ENCAP_FLAG_CSUM (1<<0) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d9c9dc4ffeaf..0bb8e141eacc 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -56,6 +56,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -495,6 +496,8 @@ static int ip_encap_hlen(struct ip_tunnel_encap *e) return 0; case TUNNEL_ENCAP_FOU: return sizeof(struct udphdr); + case TUNNEL_ENCAP_GUE: + return sizeof(struct udphdr) + sizeof(struct guehdr); default: return -EINVAL; } @@ -546,6 +549,15 @@ static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, skb_reset_transport_header(skb); uh = udp_hdr(skb); + if (e->type == TUNNEL_ENCAP_GUE) { + struct guehdr *guehdr = (struct guehdr *)&uh[1]; + + guehdr->version = 0; + guehdr->hlen = 0; + guehdr->flags = 0; + guehdr->next_hdr = *protocol; + } + uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); @@ -565,6 +577,7 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, case TUNNEL_ENCAP_NONE: return 0; case TUNNEL_ENCAP_FOU: + case TUNNEL_ENCAP_GUE: return fou_build_header(skb, &t->encap, t->encap_hlen, protocol, fl4); default: -- cgit v1.2.3 From 3be07244b7337760a3269d56b2f4a63e72218648 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 2 Oct 2014 18:26:49 +0200 Subject: ip6_gre: fix flowi6_proto value in xmit path In xmit path, we build a flowi6 which will be used for the output route lookup. We are sending a GRE packet, neither IPv4 nor IPv6 encapsulated packet, thus the protocol should be IPPROTO_GRE. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Reported-by: Matthieu Ternisien d'Ouville Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f304471477dc..97299d76c1b0 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -782,7 +782,7 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; + fl6.flowi6_proto = IPPROTO_GRE; dsfield = ipv4_get_dsfield(iph); @@ -832,7 +832,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; + fl6.flowi6_proto = IPPROTO_GRE; dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) -- cgit v1.2.3 From c8753d55afb436fd6a25c8bbe8d783f6dcf1c9f8 Mon Sep 17 00:00:00 2001 From: Vijay Subramanian Date: Thu, 2 Oct 2014 10:00:43 -0700 Subject: net: Cleanup skb cloning by adding SKB_FCLONE_FREE SKB_FCLONE_UNAVAILABLE has overloaded meaning depending on type of skb. 1: If skb is allocated from head_cache, it indicates fclone is not available. 2: If skb is a companion fclone skb (allocated from fclone_cache), it indicates it is available to be used. To avoid confusion for case 2 above, this patch replaces SKB_FCLONE_UNAVAILABLE with SKB_FCLONE_FREE where appropriate. For fclone companion skbs, this indicates it is free for use. SKB_FCLONE_UNAVAILABLE will now simply indicate skb is from head_cache and cannot / will not have a companion fclone. Signed-off-by: Vijay Subramanian Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ++++--- net/core/skbuff.c | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7c5036d11feb..3a5ec7638627 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -339,9 +339,10 @@ struct skb_shared_info { enum { - SKB_FCLONE_UNAVAILABLE, - SKB_FCLONE_ORIG, - SKB_FCLONE_CLONE, + SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */ + SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */ + SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */ + SKB_FCLONE_FREE, /* this companion fclone skb is available */ }; enum { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a0b312fa3047..28916e47f959 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -265,7 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->fclone = SKB_FCLONE_ORIG; atomic_set(&fclones->fclone_ref, 1); - fclones->skb2.fclone = SKB_FCLONE_UNAVAILABLE; + fclones->skb2.fclone = SKB_FCLONE_FREE; fclones->skb2.pfmemalloc = pfmemalloc; } out: @@ -542,7 +542,7 @@ static void kfree_skbmem(struct sk_buff *skb) fclones = container_of(skb, struct sk_buff_fclones, skb2); /* Warning : We must perform the atomic_dec_and_test() before - * setting skb->fclone back to SKB_FCLONE_UNAVAILABLE, otherwise + * setting skb->fclone back to SKB_FCLONE_FREE, otherwise * skb_clone() could set clone_ref to 2 before our decrement. * Anyway, if we are going to free the structure, no need to * rewrite skb->fclone. @@ -553,7 +553,7 @@ static void kfree_skbmem(struct sk_buff *skb) /* The clone portion is available for * fast-cloning again. */ - skb->fclone = SKB_FCLONE_UNAVAILABLE; + skb->fclone = SKB_FCLONE_FREE; } break; } @@ -874,7 +874,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return NULL; if (skb->fclone == SKB_FCLONE_ORIG && - n->fclone == SKB_FCLONE_UNAVAILABLE) { + n->fclone == SKB_FCLONE_FREE) { n->fclone = SKB_FCLONE_CLONE; /* As our fastclone was free, clone_ref must be 1 at this point. * We could use atomic_inc() here, but it is faster -- cgit v1.2.3 From f7d6b96f345be7e0bd8f7651f7fe1efa5404c1e3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 2 Oct 2014 18:56:03 -0700 Subject: net: dsa: do not call phy_start_aneg Commit f7f1de51edbd ("net: dsa: start and stop the PHY state machine") add calls to phy_start() in dsa_slave_open() respectively phy_stop() in dsa_slave_close(). We also call phy_start_aneg() in dsa_slave_create(), and this call is messing up with the PHY state machine, since we basically start the auto-negotiation, and later on restart it when calling phy_start(). phy_start() does not currently handle the PHY_FORCING or PHY_AN states properly, but such a fix would be too invasive for this window. Fixes: f7f1de51edbd ("net: dsa: start and stop the PHY state machine") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 36953c84ff2d..8030489d9cbe 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -608,7 +608,6 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, p->phy->speed = 0; p->phy->duplex = 0; p->phy->advertising = p->phy->supported | ADVERTISED_Autoneg; - phy_start_aneg(p->phy); } return slave_dev; -- cgit v1.2.3 From 1e203c1a2c104c8f8030245d2afaa337a79b4375 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 2 Oct 2014 22:43:09 -0700 Subject: net: sched: suspicious RCU usage in qdisc_watchdog Suspicious RCU usage in qdisc_watchdog call needs to be done inside rcu_read_lock/rcu_read_unlock. And then Qdisc destroy operations need to ensure timer is cancelled before removing qdisc structure. [ 3992.191339] =============================== [ 3992.191340] [ INFO: suspicious RCU usage. ] [ 3992.191343] 3.17.0-rc6net-next+ #72 Not tainted [ 3992.191345] ------------------------------- [ 3992.191347] include/net/sch_generic.h:272 suspicious rcu_dereference_check() usage! [ 3992.191348] [ 3992.191348] other info that might help us debug this: [ 3992.191348] [ 3992.191351] [ 3992.191351] rcu_scheduler_active = 1, debug_locks = 1 [ 3992.191353] no locks held by swapper/1/0. [ 3992.191355] [ 3992.191355] stack backtrace: [ 3992.191358] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.17.0-rc6net-next+ #72 [ 3992.191360] Hardware name: /DZ77RE-75K, BIOS GAZ7711H.86A.0060.2012.1115.1750 11/15/2012 [ 3992.191362] 0000000000000001 ffff880235803e48 ffffffff8178f92c 0000000000000000 [ 3992.191366] ffff8802322224a0 ffff880235803e78 ffffffff810c9966 ffff8800a5fe3000 [ 3992.191370] ffff880235803f30 ffff8802359cd768 ffff8802359cd6e0 ffff880235803e98 [ 3992.191374] Call Trace: [ 3992.191376] [] dump_stack+0x4e/0x68 [ 3992.191387] [] lockdep_rcu_suspicious+0xe6/0x130 [ 3992.191392] [] qdisc_watchdog+0x8a/0xb0 [ 3992.191396] [] __run_hrtimer+0x72/0x420 [ 3992.191399] [] ? hrtimer_interrupt+0x7d/0x240 [ 3992.191403] [] ? tc_classify+0xc0/0xc0 [ 3992.191406] [] hrtimer_interrupt+0xff/0x240 [ 3992.191410] [] ? __atomic_notifier_call_chain+0x5/0x140 [ 3992.191415] [] local_apic_timer_interrupt+0x3b/0x60 [ 3992.191419] [] smp_apic_timer_interrupt+0x45/0x60 [ 3992.191422] [] apic_timer_interrupt+0x6f/0x80 [ 3992.191424] [] ? cpuidle_enter_state+0x73/0x2e0 [ 3992.191432] [] ? cpuidle_enter_state+0x6e/0x2e0 [ 3992.191437] [] cpuidle_enter+0x17/0x20 [ 3992.191441] [] cpu_startup_entry+0x3d1/0x4a0 [ 3992.191445] [] ? clockevents_config_and_register+0x26/0x30 [ 3992.191448] [] start_secondary+0x1b6/0x260 Fixes: b26b0d1e8b1 ("net: qdisc: use rcu prefix and silence sparse warnings") Signed-off-by: John Fastabend Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_api.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index aa8329508dba..c79a226cc25c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -578,8 +578,10 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, timer); + rcu_read_lock(); qdisc_unthrottled(wd->qdisc); __netif_schedule(qdisc_root(wd->qdisc)); + rcu_read_unlock(); return HRTIMER_NORESTART; } -- cgit v1.2.3 From 34a419d4e20d6be5e0c4a3b27f6eface366a4836 Mon Sep 17 00:00:00 2001 From: Ignacy Gawędzki Date: Fri, 3 Oct 2014 15:44:48 +0200 Subject: ematch: Fix early ending of inverted containers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The result of a negated container has to be inverted before checking for early ending. This fixes my previous attempt (17c9c8232663a47f074b7452b9b034efda868ca7) to make inverted containers work correctly. Signed-off-by: Ignacy Gawędzki Signed-off-by: David S. Miller --- net/sched/ematch.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/ematch.c b/net/sched/ematch.c index ad57f4444b9c..f878fa16349a 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -526,9 +526,10 @@ pop_stack: match_idx = stack[--stackp]; cur_match = tcf_em_get_match(tree, match_idx); + if (tcf_em_is_inverted(cur_match)) + res = !res; + if (tcf_em_early_end(cur_match, res)) { - if (tcf_em_is_inverted(cur_match)) - res = !res; goto pop_stack; } else { match_idx++; -- cgit v1.2.3 From 96a20d9d7fff7068a2233b00379f0778a150bf86 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 3 Oct 2014 11:29:16 -0400 Subject: bridge: Add a default_pvid sysfs attribute This patch allows the user to set and retrieve default_pvid value. A new value can only be stored when vlan filtering is disabled. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- net/bridge/br_private.h | 2 ++ net/bridge/br_sysfs_br.c | 17 +++++++++++++++++ net/bridge/br_vlan.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index fe7463c2af9a..5a347eb1d139 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -299,6 +299,7 @@ struct net_bridge #ifdef CONFIG_BRIDGE_VLAN_FILTERING u8 vlan_enabled; __be16 vlan_proto; + u16 default_pvid; struct net_port_vlans __rcu *vlan_info; #endif }; @@ -605,6 +606,7 @@ void br_recalculate_fwd_mask(struct net_bridge *br); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); void br_vlan_init(struct net_bridge *br); +int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index cb431c6016ee..4c97fc50fb70 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -725,6 +725,22 @@ static ssize_t vlan_protocol_store(struct device *d, return store_bridge_parm(d, buf, len, br_vlan_set_proto); } static DEVICE_ATTR_RW(vlan_protocol); + +static ssize_t default_pvid_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", br->default_pvid); +} + +static ssize_t default_pvid_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid); +} +static DEVICE_ATTR_RW(default_pvid); #endif static struct attribute *bridge_attrs[] = { @@ -771,6 +787,7 @@ static struct attribute *bridge_attrs[] = { #ifdef CONFIG_BRIDGE_VLAN_FILTERING &dev_attr_vlan_filtering.attr, &dev_attr_vlan_protocol.attr, + &dev_attr_default_pvid.attr, #endif NULL }; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 3ba57fcdcd13..dfa7c9a7e193 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -499,9 +499,38 @@ err_filt: goto unlock; } +int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) +{ + u16 pvid = val; + int err = 0; + + if (!val || val >= VLAN_VID_MASK) + return -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (pvid == br->default_pvid) + goto unlock; + + /* Only allow default pvid change when filtering is disabled */ + if (br->vlan_enabled) { + pr_info_once("Please disable vlan filtering to change default_pvid\n"); + err = -EPERM; + goto unlock; + } + + br->default_pvid = pvid; + +unlock: + rtnl_unlock(); + return err; +} + void br_vlan_init(struct net_bridge *br) { br->vlan_proto = htons(ETH_P_8021Q); + br->default_pvid = 1; } /* Must be protected by RTNL. -- cgit v1.2.3 From 3df6bf45ec008942f16f1814123c4bdebcf50741 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 3 Oct 2014 11:29:17 -0400 Subject: bridge: Simplify pvid checks. Currently, if the pvid is not set, we return an illegal vlan value even though the pvid value is set to 0. Since pvid of 0 is currently invalid, just return 0 instead. This makes the current and future checks simpler. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- net/bridge/br_private.h | 7 ++----- net/bridge/br_vlan.c | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 5a347eb1d139..f671561b3053 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -643,11 +643,8 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) static inline u16 br_get_pvid(const struct net_port_vlans *v) { - /* Return just the VID if it is set, or VLAN_N_VID (invalid vid) if - * vid wasn't set - */ smp_rmb(); - return v->pvid ?: VLAN_N_VID; + return v->pvid; } static inline int br_vlan_enabled(struct net_bridge *br) @@ -746,7 +743,7 @@ static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag) } static inline u16 br_get_pvid(const struct net_port_vlans *v) { - return VLAN_N_VID; /* Returns invalid vid */ + return 0; } static inline int br_vlan_enabled(struct net_bridge *br) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index dfa7c9a7e193..e11c9932e706 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -223,7 +223,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, * See if pvid is set on this port. That tells us which * vlan untagged or priority-tagged traffic belongs to. */ - if (pvid == VLAN_N_VID) + if (!pvid) goto drop; /* PVID is set on this port. Any untagged or priority-tagged @@ -292,7 +292,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) if (!*vid) { *vid = br_get_pvid(v); - if (*vid == VLAN_N_VID) + if (!*vid) return false; return true; -- cgit v1.2.3 From 5be5a2df40f005ea7fb7e280e87bbbcfcf1c2fc0 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 3 Oct 2014 11:29:18 -0400 Subject: bridge: Add filtering support for default_pvid Currently when vlan filtering is turned on on the bridge, the bridge will drop all traffic untill the user configures the filter. This isn't very nice for ports that don't care about vlans and just want untagged traffic. A concept of a default_pvid was recently introduced. This patch adds filtering support for default_pvid. Now, ports that don't care about vlans and don't define there own filter will belong to the VLAN of the default_pvid and continue to receive untagged traffic. This filtering can be disabled by setting default_pvid to 0. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- net/bridge/br_device.c | 8 +++- net/bridge/br_if.c | 3 ++ net/bridge/br_private.h | 14 +++++- net/bridge/br_vlan.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 136 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 659cac15c0df..ffd379db5938 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -88,12 +88,17 @@ out: static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); + int err; br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!br->stats) return -ENOMEM; - return 0; + err = br_vlan_init(br); + if (err) + free_percpu(br->stats); + + return err; } static int br_dev_open(struct net_device *dev) @@ -389,5 +394,4 @@ void br_dev_setup(struct net_device *dev) br_netfilter_rtable_init(br); br_stp_timer_init(br); br_multicast_init(br); - br_vlan_init(br); } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 7b7289ca2992..ed307db7a12b 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -500,6 +500,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (br_fdb_insert(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); + if (nbp_vlan_init(p)) + netdev_err(dev, "failed to initialize vlan filtering on this port\n"); + spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index f671561b3053..d8cbaa694227 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -605,12 +605,13 @@ bool br_vlan_find(struct net_bridge *br, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); -void br_vlan_init(struct net_bridge *br); +int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); bool nbp_vlan_find(struct net_bridge_port *port, u16 vid); +int nbp_vlan_init(struct net_bridge_port *port); static inline struct net_port_vlans *br_get_vlan_info( const struct net_bridge *br) @@ -643,6 +644,9 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) static inline u16 br_get_pvid(const struct net_port_vlans *v) { + if (!v) + return 0; + smp_rmb(); return v->pvid; } @@ -703,8 +707,9 @@ static inline void br_recalculate_fwd_mask(struct net_bridge *br) { } -static inline void br_vlan_init(struct net_bridge *br) +static inline int br_vlan_init(struct net_bridge *br) { + return 0; } static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) @@ -737,6 +742,11 @@ static inline bool nbp_vlan_find(struct net_bridge_port *port, u16 vid) return false; } +static inline int nbp_vlan_init(struct net_bridge_port *port) +{ + return 0; +} + static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag) { return 0; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index e11c9932e706..150048fb99b0 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -499,12 +499,110 @@ err_filt: goto unlock; } +static bool vlan_default_pvid(struct net_port_vlans *pv, u16 vid) +{ + return pv && vid == pv->pvid && test_bit(vid, pv->untagged_bitmap); +} + +static void br_vlan_disable_default_pvid(struct net_bridge *br) +{ + struct net_bridge_port *p; + u16 pvid = br->default_pvid; + + /* Disable default_pvid on all ports where it is still + * configured. + */ + if (vlan_default_pvid(br_get_vlan_info(br), pvid)) + br_vlan_delete(br, pvid); + + list_for_each_entry(p, &br->port_list, list) { + if (vlan_default_pvid(nbp_get_vlan_info(p), pvid)) + nbp_vlan_delete(p, pvid); + } + + br->default_pvid = 0; +} + +static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) +{ + struct net_bridge_port *p; + u16 old_pvid; + int err = 0; + unsigned long *changed; + + changed = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long), + GFP_KERNEL); + if (!changed) + return -ENOMEM; + + old_pvid = br->default_pvid; + + /* Update default_pvid config only if we do not conflict with + * user configuration. + */ + if ((!old_pvid || vlan_default_pvid(br_get_vlan_info(br), old_pvid)) && + !br_vlan_find(br, pvid)) { + err = br_vlan_add(br, pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED); + if (err) + goto out; + br_vlan_delete(br, old_pvid); + set_bit(0, changed); + } + + list_for_each_entry(p, &br->port_list, list) { + /* Update default_pvid config only if we do not conflict with + * user configuration. + */ + if ((old_pvid && + !vlan_default_pvid(nbp_get_vlan_info(p), old_pvid)) || + nbp_vlan_find(p, pvid)) + continue; + + err = nbp_vlan_add(p, pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED); + if (err) + goto err_port; + nbp_vlan_delete(p, old_pvid); + set_bit(p->port_no, changed); + } + + br->default_pvid = pvid; + +out: + kfree(changed); + return err; + +err_port: + list_for_each_entry_continue_reverse(p, &br->port_list, list) { + if (!test_bit(p->port_no, changed)) + continue; + + if (old_pvid) + nbp_vlan_add(p, old_pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED); + nbp_vlan_delete(p, pvid); + } + + if (test_bit(0, changed)) { + if (old_pvid) + br_vlan_add(br, old_pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED); + br_vlan_delete(br, pvid); + } + goto out; +} + int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) { u16 pvid = val; int err = 0; - if (!val || val >= VLAN_VID_MASK) + if (val >= VLAN_VID_MASK) return -EINVAL; if (!rtnl_trylock()) @@ -520,17 +618,22 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) goto unlock; } - br->default_pvid = pvid; + if (!pvid) + br_vlan_disable_default_pvid(br); + else + err = __br_vlan_set_default_pvid(br, pvid); unlock: rtnl_unlock(); return err; } -void br_vlan_init(struct net_bridge *br) +int br_vlan_init(struct net_bridge *br) { br->vlan_proto = htons(ETH_P_8021Q); br->default_pvid = 1; + return br_vlan_add(br, 1, + BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED); } /* Must be protected by RTNL. @@ -622,3 +725,12 @@ out: rcu_read_unlock(); return found; } + +int nbp_vlan_init(struct net_bridge_port *p) +{ + return p->br->default_pvid ? + nbp_vlan_add(p, p->br->default_pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED) : + 0; +} -- cgit v1.2.3 From bdf6fa52f01b941d4a80372d56de465bdbbd1d23 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 3 Oct 2014 18:16:20 -0400 Subject: sctp: handle association restarts when the socket is closed. Currently association restarts do not take into consideration the state of the socket. When a restart happens, the current assocation simply transitions into established state. This creates a condition where a remote system, through a the restart procedure, may create a local association that is no way reachable by user. The conditions to trigger this are as follows: 1) Remote does not acknoledge some data causing data to remain outstanding. 2) Local application calls close() on the socket. Since data is still outstanding, the association is placed in SHUTDOWN_PENDING state. However, the socket is closed. 3) The remote tries to create a new association, triggering a restart on the local system. The association moves from SHUTDOWN_PENDING to ESTABLISHED. At this point, it is no longer reachable by any socket on the local system. This patch addresses the above situation by moving the newly ESTABLISHED association into SHUTDOWN-SENT state and bundling a SHUTDOWN after the COOKIE-ACK chunk. This way, the restarted associate immidiately enters the shutdown procedure and forces the termination of the unreachable association. Reported-by: David Laight Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sctp/command.h | 2 +- net/sctp/sm_statefuns.c | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index f22538e68245..d4a20d00461c 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -115,7 +115,7 @@ typedef enum { * analysis of the state functions, but in reality just taken from * thin air in the hopes othat we don't trigger a kernel panic. */ -#define SCTP_MAX_NUM_COMMANDS 14 +#define SCTP_MAX_NUM_COMMANDS 20 typedef union { void *zero_all; /* Set to NULL to clear the entire union */ diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d3f1ea460c50..c8f606324134 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1775,9 +1775,22 @@ static sctp_disposition_t sctp_sf_do_dupcook_a(struct net *net, /* Update the content of current association. */ sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); - sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, - SCTP_STATE(SCTP_STATE_ESTABLISHED)); - sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + if (sctp_state(asoc, SHUTDOWN_PENDING) && + (sctp_sstate(asoc->base.sk, CLOSING) || + sock_flag(asoc->base.sk, SOCK_DEAD))) { + /* if were currently in SHUTDOWN_PENDING, but the socket + * has been closed by user, don't transition to ESTABLISHED. + * Instead trigger SHUTDOWN bundled with COOKIE_ACK. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + return sctp_sf_do_9_2_start_shutdown(net, ep, asoc, + SCTP_ST_CHUNK(0), NULL, + commands); + } else { + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_ESTABLISHED)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + } return SCTP_DISPOSITION_CONSUME; nomem_ev: -- cgit v1.2.3 From 0b5e8b8eeae40bae6ad7c7e91c97c3c0d0e57882 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 3 Oct 2014 15:35:28 -0700 Subject: net: Add Geneve tunneling protocol driver This adds a device level support for Geneve -- Generic Network Virtualization Encapsulation. The protocol is documented at http://tools.ietf.org/html/draft-gross-geneve-01 Only protocol layer Geneve support is provided by this driver. Openvswitch can be used for configuring, set up and tear down functional Geneve tunnels. Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/net/geneve.h | 91 ++++++++++++ include/net/ip_tunnels.h | 2 + net/ipv4/Kconfig | 14 ++ net/ipv4/Makefile | 1 + net/ipv4/geneve.c | 373 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 481 insertions(+) create mode 100644 include/net/geneve.h create mode 100644 net/ipv4/geneve.c (limited to 'net') diff --git a/include/net/geneve.h b/include/net/geneve.h new file mode 100644 index 000000000000..ce98865318bf --- /dev/null +++ b/include/net/geneve.h @@ -0,0 +1,91 @@ +#ifndef __NET_GENEVE_H +#define __NET_GENEVE_H 1 + +#include + +struct geneve_sock; + +typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb); + +struct geneve_sock { + struct hlist_node hlist; + geneve_rcv_t *rcv; + void *rcv_data; + struct work_struct del_work; + struct socket *sock; + struct rcu_head rcu; + atomic_t refcnt; + struct udp_offload udp_offloads; +}; + +/* Geneve Header: + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |Ver| Opt Len |O|C| Rsvd. | Protocol Type | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Virtual Network Identifier (VNI) | Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Variable Length Options | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Option Header: + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Option Class | Type |R|R|R| Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Variable Option Data | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ + +struct geneve_opt { + __be16 opt_class; + u8 type; +#ifdef __LITTLE_ENDIAN_BITFIELD + u8 length:5; + u8 r3:1; + u8 r2:1; + u8 r1:1; +#else + u8 r1:1; + u8 r2:1; + u8 r3:1; + u8 length:5; +#endif + u8 opt_data[]; +}; + +#define GENEVE_CRIT_OPT_TYPE (1 << 7) + +struct genevehdr { +#ifdef __LITTLE_ENDIAN_BITFIELD + u8 opt_len:6; + u8 ver:2; + u8 rsvd1:6; + u8 critical:1; + u8 oam:1; +#else + u8 ver:2; + u8 opt_len:6; + u8 oam:1; + u8 critical:1; + u8 rsvd1:6; +#endif + __be16 proto_type; + u8 vni[3]; + u8 rsvd2; + struct geneve_opt options[]; +}; + +#define GENEVE_VER 0 +#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) + +struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool no_share, bool ipv6); + +void geneve_sock_release(struct geneve_sock *vs); + +int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + bool xnet); +#endif diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 7f538ba6e267..a9ce1556d773 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -95,6 +95,8 @@ struct ip_tunnel { #define TUNNEL_VERSION __cpu_to_be16(0x40) #define TUNNEL_NO_KEY __cpu_to_be16(0x80) #define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100) +#define TUNNEL_OAM __cpu_to_be16(0x0200) +#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) struct tnl_ptk_info { __be16 flags; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 69fb37854449..c2035447e649 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -453,6 +453,20 @@ config TCP_CONG_BIC increase provides TCP friendliness. See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ +config GENEVE + tristate "Generic Network Virtualization Encapsulation (Geneve)" + depends on INET + select NET_IP_TUNNEL + select NET_UDP_TUNNEL + ---help--- + This allows one to create Geneve virtual interfaces that provide + Layer 2 Networks over Layer 3 Networks. Geneve is often used + to tunnel virtual network infrastructure in virtualized environments. + For more information see: + http://tools.ietf.org/html/draft-gross-geneve-01 + + To compile this driver as a module, choose M here: the module + config TCP_CONG_CUBIC tristate "CUBIC TCP" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index d8105787c199..518c04ed666e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o +obj-$(CONFIG_GENEVE) += geneve.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c new file mode 100644 index 000000000000..f008c5515f48 --- /dev/null +++ b/net/ipv4/geneve.c @@ -0,0 +1,373 @@ +/* + * Geneve: Generic Network Virtualization Encapsulation + * + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#include +#include +#endif + +#define PORT_HASH_BITS 8 +#define PORT_HASH_SIZE (1<sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; +} + +/* Find geneve socket based on network namespace and UDP port */ +static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port) +{ + struct geneve_sock *gs; + + hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) { + if (inet_sk(gs->sock->sk)->inet_sport == port) + return gs; + } + + return NULL; +} + +static void geneve_build_header(struct genevehdr *geneveh, + __be16 tun_flags, u8 vni[3], + u8 options_len, u8 *options) +{ + geneveh->ver = GENEVE_VER; + geneveh->opt_len = options_len / 4; + geneveh->oam = !!(tun_flags & TUNNEL_OAM); + geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); + geneveh->rsvd1 = 0; + memcpy(geneveh->vni, vni, 3); + geneveh->proto_type = htons(ETH_P_TEB); + geneveh->rsvd2 = 0; + + memcpy(geneveh->options, options, options_len); +} + +/* Transmit a fully formated Geneve frame. + * + * When calling this function. The skb->data should point + * to the geneve header which is fully formed. + * + * This function will add other UDP tunnel headers. + */ +int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + bool xnet) +{ + struct genevehdr *gnvh; + int min_headroom; + int err; + + skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) + + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) + return err; + + if (vlan_tx_tag_present(skb)) { + if (unlikely(!__vlan_put_tag(skb, + skb->vlan_proto, + vlan_tx_tag_get(skb)))) { + err = -ENOMEM; + return err; + } + skb->vlan_tci = 0; + } + + gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); + geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); + + return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst, + tos, ttl, df, src_port, dst_port, xnet); +} +EXPORT_SYMBOL_GPL(geneve_xmit_skb); + +static void geneve_notify_add_rx_port(struct geneve_sock *gs) +{ + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + int err; + + if (sa_family == AF_INET) { + err = udp_add_offload(&gs->udp_offloads); + if (err) + pr_warn("geneve: udp_add_offload failed with status %d\n", + err); + } +} + +/* Callback from net/ipv4/udp.c to receive packets */ +static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct genevehdr *geneveh; + struct geneve_sock *gs; + int opts_len; + + /* Need Geneve and inner Ethernet header to be present */ + if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) + goto error; + + /* Return packets with reserved bits set */ + geneveh = geneve_hdr(skb); + + if (unlikely(geneveh->ver != GENEVE_VER)) + goto error; + + if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) + goto error; + + opts_len = geneveh->opt_len * 4; + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, + htons(ETH_P_TEB))) + goto drop; + + gs = rcu_dereference_sk_user_data(sk); + if (!gs) + goto drop; + + gs->rcv(gs, skb); + return 0; + +drop: + /* Consume bad packet */ + kfree_skb(skb); + return 0; + +error: + /* Let the UDP layer deal with the skb */ + return 1; +} + +static void geneve_del_work(struct work_struct *work) +{ + struct geneve_sock *gs = container_of(work, struct geneve_sock, + del_work); + + udp_tunnel_sock_release(gs->sock); + kfree_rcu(gs, rcu); +} + +static struct socket *geneve_create_sock(struct net *net, bool ipv6, + __be16 port) +{ + struct socket *sock; + struct udp_port_cfg udp_conf; + int err; + + memset(&udp_conf, 0, sizeof(udp_conf)); + + if (ipv6) { + udp_conf.family = AF_INET6; + } else { + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = INADDR_ANY; + } + + udp_conf.local_udp_port = port; + + /* Open UDP socket */ + err = udp_sock_create(net, &udp_conf, &sock); + if (err < 0) + return ERR_PTR(err); + + return sock; +} + +/* Create new listen socket if needed */ +static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool ipv6) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_sock *gs; + struct socket *sock; + struct udp_tunnel_sock_cfg tunnel_cfg; + + gs = kzalloc(sizeof(*gs), GFP_KERNEL); + if (!gs) + return ERR_PTR(-ENOMEM); + + INIT_WORK(&gs->del_work, geneve_del_work); + + sock = geneve_create_sock(net, ipv6, port); + if (IS_ERR(sock)) { + kfree(gs); + return ERR_CAST(sock); + } + + gs->sock = sock; + atomic_set(&gs->refcnt, 1); + gs->rcv = rcv; + gs->rcv_data = data; + + /* Initialize the geneve udp offloads structure */ + gs->udp_offloads.port = port; + gs->udp_offloads.callbacks.gro_receive = NULL; + gs->udp_offloads.callbacks.gro_complete = NULL; + + spin_lock(&gn->sock_lock); + hlist_add_head_rcu(&gs->hlist, gs_head(net, port)); + geneve_notify_add_rx_port(gs); + spin_unlock(&gn->sock_lock); + + /* Mark socket as an encapsulation socket */ + tunnel_cfg.sk_user_data = gs; + tunnel_cfg.encap_type = 1; + tunnel_cfg.encap_rcv = geneve_udp_encap_recv; + tunnel_cfg.encap_destroy = NULL; + setup_udp_tunnel_sock(net, sock, &tunnel_cfg); + + return gs; +} + +struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool no_share, bool ipv6) +{ + struct geneve_sock *gs; + + gs = geneve_socket_create(net, port, rcv, data, ipv6); + if (!IS_ERR(gs)) + return gs; + + if (no_share) /* Return error if sharing is not allowed. */ + return ERR_PTR(-EINVAL); + + gs = geneve_find_sock(net, port); + if (gs) { + if (gs->rcv == rcv) + atomic_inc(&gs->refcnt); + else + gs = ERR_PTR(-EBUSY); + } else { + gs = ERR_PTR(-EINVAL); + } + + return gs; +} +EXPORT_SYMBOL_GPL(geneve_sock_add); + +void geneve_sock_release(struct geneve_sock *gs) +{ + if (!atomic_dec_and_test(&gs->refcnt)) + return; + + queue_work(geneve_wq, &gs->del_work); +} +EXPORT_SYMBOL_GPL(geneve_sock_release); + +static __net_init int geneve_init_net(struct net *net) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + unsigned int h; + + spin_lock_init(&gn->sock_lock); + + for (h = 0; h < PORT_HASH_SIZE; ++h) + INIT_HLIST_HEAD(&gn->sock_list[h]); + + return 0; +} + +static struct pernet_operations geneve_net_ops = { + .init = geneve_init_net, + .exit = NULL, + .id = &geneve_net_id, + .size = sizeof(struct geneve_net), +}; + +static int __init geneve_init_module(void) +{ + int rc; + + geneve_wq = alloc_workqueue("geneve", 0, 0); + if (!geneve_wq) + return -ENOMEM; + + rc = register_pernet_subsys(&geneve_net_ops); + if (rc) + return rc; + + pr_info("Geneve driver\n"); + + return 0; +} +late_initcall(geneve_init_module); + +static void __exit geneve_cleanup_module(void) +{ + destroy_workqueue(geneve_wq); +} +module_exit(geneve_cleanup_module); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jesse Gross "); +MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); +MODULE_ALIAS_RTNL_LINK("geneve"); -- cgit v1.2.3 From 0714812134d7dcadeb7ecfbfeb18788aa7e1eaac Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:29 -0700 Subject: openvswitch: Eliminate memset() from flow_extract. As new protocols are added, the size of the flow key tends to increase although few protocols care about all of the fields. In order to optimize this for hashing and matching, OVS uses a variable length portion of the key. However, when fields are extracted from the packet we must still zero out the entire key. This is no longer necessary now that OVS implements masking. Any fields (or holes in the structure) which are not part of a given protocol will be by definition not part of the mask and zeroed out during lookup. Furthermore, since masking already uses variable length keys this zeroing operation automatically benefits as well. In principle, the only thing that needs to be done at this point is remove the memset() at the beginning of flow. However, some fields assume that they are initialized to zero, which now must be done explicitly. In addition, in the event of an error we must also zero out corresponding fields to signal that there is no valid data present. These increase the total amount of code but very little of it is executed in non-error situations. Removing the memset() reduces the profile of ovs_flow_extract() from 0.64% to 0.56% when tested with large packets on a 10G link. Suggested-by: Pravin Shelar Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 54 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 4010423f2831..913bdc1a83c0 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -462,6 +462,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) * update skb->csum here. */ + key->eth.tci = 0; if (vlan_tx_tag_present(skb)) key->eth.tci = htons(skb->vlan_tci); else if (eth->h_proto == htons(ETH_P_8021Q)) @@ -482,6 +483,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) error = check_iphdr(skb); if (unlikely(error)) { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); if (error == -EINVAL) { skb->transport_header = skb->network_header; error = 0; @@ -503,8 +506,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return 0; } if (nh->frag_off & htons(IP_MF) || - skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; + else + key->ip.frag = OVS_FRAG_TYPE_NONE; /* Transport layer. */ if (key->ip.proto == IPPROTO_TCP) { @@ -513,18 +518,25 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) key->tp.src = tcp->source; key->tp.dst = tcp->dest; key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } + } else if (key->ip.proto == IPPROTO_UDP) { if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->tp.src = udp->source; key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == IPPROTO_SCTP) { if (sctphdr_ok(skb)) { struct sctphdr *sctp = sctp_hdr(skb); key->tp.src = sctp->source; key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == IPPROTO_ICMP) { if (icmphdr_ok(skb)) { @@ -534,33 +546,44 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) * them in 16-bit network byte order. */ key->tp.src = htons(icmp->type); key->tp.dst = htons(icmp->code); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } - } else if ((key->eth.type == htons(ETH_P_ARP) || - key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) { + } else if (key->eth.type == htons(ETH_P_ARP) || + key->eth.type == htons(ETH_P_RARP)) { struct arp_eth_header *arp; arp = (struct arp_eth_header *)skb_network_header(skb); - if (arp->ar_hrd == htons(ARPHRD_ETHER) - && arp->ar_pro == htons(ETH_P_IP) - && arp->ar_hln == ETH_ALEN - && arp->ar_pln == 4) { + if (arphdr_ok(skb) && + arp->ar_hrd == htons(ARPHRD_ETHER) && + arp->ar_pro == htons(ETH_P_IP) && + arp->ar_hln == ETH_ALEN && + arp->ar_pln == 4) { /* We only match on the lower 8 bits of the opcode. */ if (ntohs(arp->ar_op) <= 0xff) key->ip.proto = ntohs(arp->ar_op); + else + key->ip.proto = 0; + memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha); ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha); + } else { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ nh_len = parse_ipv6hdr(skb, key); if (unlikely(nh_len < 0)) { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); if (nh_len == -EINVAL) { skb->transport_header = skb->network_header; error = 0; @@ -582,24 +605,32 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) key->tp.src = tcp->source; key->tp.dst = tcp->dest; key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == NEXTHDR_UDP) { if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->tp.src = udp->source; key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == NEXTHDR_SCTP) { if (sctphdr_ok(skb)) { struct sctphdr *sctp = sctp_hdr(skb); key->tp.src = sctp->source; key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == NEXTHDR_ICMP) { if (icmp6hdr_ok(skb)) { error = parse_icmpv6(skb, key, nh_len); if (error) return error; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } } @@ -615,13 +646,19 @@ int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ - memset(key, 0, sizeof(*key)); if (tun_key) memcpy(&key->tun_key, tun_key, sizeof(key->tun_key)); + else + memset(&key->tun_key, 0, sizeof(key->tun_key)); key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; key->phy.skb_mark = skb->mark; + key->ovs_flow_hash = 0; + key->recirc_id = 0; + + /* Flags are always used as part of stats */ + key->tp.flags = 0; return key_extract(skb, key); } @@ -632,7 +669,6 @@ int ovs_flow_key_extract_userspace(const struct nlattr *attr, { int err; - memset(key, 0, sizeof(*key)); /* Extract metadata from netlink attributes. */ err = ovs_nla_get_flow_metadata(attr, key); if (err) -- cgit v1.2.3 From 67fa034194bf82a3d5ca841759d921297daa63ca Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:30 -0700 Subject: openvswitch: Add support for matching on OAM packets. Some tunnel formats have mechanisms for indicating that packets are OAM frames that should be handled specially (either as high priority or not forwarded beyond an endpoint). This provides support for allowing those types of packets to be matched. Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/datapath.c | 1 + net/openvswitch/flow_netlink.c | 17 ++++++++++++----- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index f7fc507d82ab..7c06106f5af5 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -309,6 +309,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */ OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ + OVS_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9e3a2fae6a8f..f6bd93d5f435 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -369,6 +369,7 @@ static size_t key_attr_size(void) + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index f4c8daa73965..22c855fa0bc2 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -346,6 +346,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_TTL] = 1, [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, + [OVS_TUNNEL_KEY_ATTR_OAM] = 0, }; if (type > OVS_TUNNEL_KEY_ATTR_MAX) { @@ -390,6 +391,9 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_CSUM: tun_flags |= TUNNEL_CSUM; break; + case OVS_TUNNEL_KEY_ATTR_OAM: + tun_flags |= TUNNEL_OAM; + break; default: return -EINVAL; } @@ -431,21 +435,24 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; if (output->ipv4_src && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) return -EMSGSIZE; if (output->ipv4_dst && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) return -EMSGSIZE; if (output->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) return -EMSGSIZE; if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_CSUM) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_OAM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; nla_nest_end(skb, nla); -- cgit v1.2.3 From f0b128c1e2cc33ad104daf0f51a51e34f7763c5f Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:31 -0700 Subject: openvswitch: Wrap struct ovs_key_ipv4_tunnel in a new structure. Currently, the flow information that is matched for tunnels and the tunnel data passed around with packets is the same. However, as additional information is added this is not necessarily desirable, as in the case of pointers. This adds a new structure for tunnel metadata which currently contains only the existing struct. This change is purely internal to the kernel since the current OVS_KEY_ATTR_IPV4_TUNNEL is simply a compressed version of OVS_KEY_ATTR_TUNNEL that is translated at flow setup. Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 +- net/openvswitch/actions.c | 5 +++-- net/openvswitch/datapath.h | 2 +- net/openvswitch/flow.c | 6 +++--- net/openvswitch/flow.h | 30 +++++++++++++++++------------- net/openvswitch/flow_netlink.c | 38 +++++++++++++++++++++++++++++++------- net/openvswitch/vport-gre.c | 16 +++++++++------- net/openvswitch/vport-vxlan.c | 10 +++++----- net/openvswitch/vport.c | 6 +++--- net/openvswitch/vport.h | 2 +- 10 files changed, 74 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 7c06106f5af5..6753032832e2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -294,7 +294,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ #ifdef __KERNEL__ - OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ + OVS_KEY_ATTR_TUNNEL_INFO, /* struct ovs_tunnel_info */ #endif __OVS_KEY_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 6932a42e41a2..006886dbee36 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -590,8 +590,8 @@ static int execute_set_action(struct sk_buff *skb, skb->mark = nla_get_u32(nested_attr); break; - case OVS_KEY_ATTR_IPV4_TUNNEL: - OVS_CB(skb)->egress_tun_key = nla_data(nested_attr); + case OVS_KEY_ATTR_TUNNEL_INFO: + OVS_CB(skb)->egress_tun_info = nla_data(nested_attr); break; case OVS_KEY_ATTR_ETHERNET: @@ -778,6 +778,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); this_cpu_inc(exec_actions_level); + OVS_CB(skb)->egress_tun_info = NULL; err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index ac3f3df96961..974135439c5c 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -102,8 +102,8 @@ struct datapath { */ struct ovs_skb_cb { struct sw_flow *flow; + struct ovs_tunnel_info *egress_tun_info; struct vport *input_vport; - struct ovs_key_ipv4_tunnel *egress_tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 913bdc1a83c0..2924cb340868 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -642,12 +642,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, +int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ - if (tun_key) - memcpy(&key->tun_key, tun_key, sizeof(key->tun_key)); + if (tun_info) + memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); else memset(&key->tun_key, 0, sizeof(key->tun_key)); diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 0f5db4ec565d..fe5a71b81c1f 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -49,20 +49,24 @@ struct ovs_key_ipv4_tunnel { u8 ipv4_ttl; } __packed __aligned(4); /* Minimize padding. */ -static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, - const struct iphdr *iph, __be64 tun_id, - __be16 tun_flags) +struct ovs_tunnel_info { + struct ovs_key_ipv4_tunnel tunnel; +}; + +static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, + const struct iphdr *iph, + __be64 tun_id, __be16 tun_flags) { - tun_key->tun_id = tun_id; - tun_key->ipv4_src = iph->saddr; - tun_key->ipv4_dst = iph->daddr; - tun_key->ipv4_tos = iph->tos; - tun_key->ipv4_ttl = iph->ttl; - tun_key->tun_flags = tun_flags; + tun_info->tunnel.tun_id = tun_id; + tun_info->tunnel.ipv4_src = iph->saddr; + tun_info->tunnel.ipv4_dst = iph->daddr; + tun_info->tunnel.ipv4_tos = iph->tos; + tun_info->tunnel.ipv4_ttl = iph->ttl; + tun_info->tunnel.tun_flags = tun_flags; /* clear struct padding. */ - memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0, - sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE); + memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0, + sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); } struct sw_flow_key { @@ -190,8 +194,8 @@ void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, - struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, + struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ int ovs_flow_key_extract_userspace(const struct nlattr *attr, struct sk_buff *skb, diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 22c855fa0bc2..5d6194d9dadc 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1148,13 +1148,14 @@ out: return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); } -static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) +static struct nlattr *__add_action(struct sw_flow_actions **sfa, + int attrtype, void *data, int len) { struct nlattr *a; a = reserve_sfa_size(sfa, nla_attr_size(len)); if (IS_ERR(a)) - return PTR_ERR(a); + return a; a->nla_type = attrtype; a->nla_len = nla_attr_size(len); @@ -1163,6 +1164,18 @@ static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, in memcpy(nla_data(a), data, len); memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + return a; +} + +static int add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len) +{ + struct nlattr *a; + + a = __add_action(sfa, attrtype, data, len); + if (IS_ERR(a)) + return PTR_ERR(a); + return 0; } @@ -1268,6 +1281,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; + struct ovs_tunnel_info *tun_info; + struct nlattr *a; int err, start; ovs_match_init(&match, &key, NULL); @@ -1279,8 +1294,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; - err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, - sizeof(match.key->tun_key)); + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, + sizeof(*tun_info)); + if (IS_ERR(a)) + return PTR_ERR(a); + + tun_info = nla_data(a); + tun_info->tunnel = key.tun_key; + add_nested_action_end(*sfa, start); return err; @@ -1563,17 +1584,20 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) int err; switch (key_type) { - case OVS_KEY_ATTR_IPV4_TUNNEL: + case OVS_KEY_ATTR_TUNNEL_INFO: { + struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key), - nla_data(ovs_key)); + err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, + nla_data(ovs_key)); if (err) return err; nla_nest_end(skb, start); break; + } default: if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) return -EMSGSIZE; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 309cca6e816f..fe768bd600eb 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -63,8 +63,10 @@ static __be16 filter_tnl_flags(__be16 flags) static struct sk_buff *__build_header(struct sk_buff *skb, int tunnel_hlen) { - const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->egress_tun_key; struct tnl_ptk_info tpi; + const struct ovs_key_ipv4_tunnel *tun_key; + + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); if (IS_ERR(skb)) @@ -92,7 +94,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq) static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { - struct ovs_key_ipv4_tunnel tun_key; + struct ovs_tunnel_info tun_info; struct ovs_net *ovs_net; struct vport *vport; __be64 key; @@ -103,10 +105,10 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, - filter_tnl_flags(tpi->flags)); + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, + filter_tnl_flags(tpi->flags)); - ovs_vport_receive(vport, skb, &tun_key); + ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; } @@ -137,12 +139,12 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 df; int err; - if (unlikely(!OVS_CB(skb)->egress_tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; goto error; } - tun_key = OVS_CB(skb)->egress_tun_key; + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; /* Route lookup */ memset(&fl, 0, sizeof(fl)); fl.daddr = tun_key->ipv4_dst; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index f19539bb8adc..5fbff2c1ee49 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -58,7 +58,7 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport) /* Called with rcu_read_lock and BH disabled. */ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) { - struct ovs_key_ipv4_tunnel tun_key; + struct ovs_tunnel_info tun_info; struct vport *vport = vs->data; struct iphdr *iph; __be64 key; @@ -66,9 +66,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(vx_vni) >> 8); - ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); + ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY); - ovs_vport_receive(vport, skb, &tun_key); + ovs_vport_receive(vport, skb, &tun_info); } static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) @@ -147,12 +147,12 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 df; int err; - if (unlikely(!OVS_CB(skb)->egress_tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; goto error; } - tun_key = OVS_CB(skb)->egress_tun_key; + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; /* Route lookup */ memset(&fl, 0, sizeof(fl)); fl.daddr = tun_key->ipv4_dst; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 5df8377fcfb1..3e50ee8a218c 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -432,7 +432,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *p, struct sk_buff *skb) * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - struct ovs_key_ipv4_tunnel *tun_key) + struct ovs_tunnel_info *tun_info) { struct pcpu_sw_netstats *stats; struct sw_flow_key key; @@ -445,9 +445,9 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, u64_stats_update_end(&stats->syncp); OVS_CB(skb)->input_vport = vport; - OVS_CB(skb)->egress_tun_key = NULL; + OVS_CB(skb)->egress_tun_info = NULL; /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_key_extract(tun_key, skb, &key); + error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); return; diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 0efd62fef1e8..e28964aba021 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -207,7 +207,7 @@ static inline struct vport *vport_from_priv(void *priv) } void ovs_vport_receive(struct vport *, struct sk_buff *, - struct ovs_key_ipv4_tunnel *); + struct ovs_tunnel_info *); /* List of statically compiled vport implementations. Don't forget to also * add yours to the list at the top of vport.c. */ -- cgit v1.2.3 From 6b205b2ca17e88ef5e10451b720056b790cc63a5 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:32 -0700 Subject: openvswitch: Factor out allocation and verification of actions. As the size of the flow key grows, it can put some pressure on the stack. This is particularly true in ovs_flow_cmd_set(), which needs several copies of the key on the stack. One of those uses is logically separate, so this factors it out to reduce stack pressure and improve readibility. Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f6bd93d5f435..010125c48244 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -933,11 +933,34 @@ error: return error; } +static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, + const struct sw_flow_key *key, + const struct sw_flow_mask *mask) +{ + struct sw_flow_actions *acts; + struct sw_flow_key masked_key; + int error; + + acts = ovs_nla_alloc_flow_actions(nla_len(a)); + if (IS_ERR(acts)) + return acts; + + ovs_flow_mask_key(&masked_key, key, mask); + error = ovs_nla_copy_actions(a, &masked_key, 0, &acts); + if (error) { + OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); + kfree(acts); + return ERR_PTR(error); + } + + return acts; +} + static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; - struct sw_flow_key key, masked_key; + struct sw_flow_key key; struct sw_flow *flow; struct sw_flow_mask mask; struct sk_buff *reply = NULL; @@ -959,17 +982,10 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); - error = PTR_ERR(acts); - if (IS_ERR(acts)) + acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask); + if (IS_ERR(acts)) { + error = PTR_ERR(acts); goto error; - - ovs_flow_mask_key(&masked_key, &key, &mask); - error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], - &masked_key, 0, &acts); - if (error) { - OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); - goto err_kfree_acts; } } -- cgit v1.2.3 From f5796684069e0c71c65bce6a6d4766114aec1396 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:33 -0700 Subject: openvswitch: Add support for Geneve tunneling. The Openvswitch implementation is completely agnostic to the options that are in use and can handle newly defined options without further work. It does this by simply matching on a byte array of options and allowing userspace to setup flows on this array. Signed-off-by: Jesse Gross Singed-off-by: Ansis Atteka Signed-off-by: Andy Zhou Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 21 ++-- include/uapi/linux/openvswitch.h | 2 + net/openvswitch/Kconfig | 11 ++ net/openvswitch/Makefile | 4 + net/openvswitch/datapath.c | 5 +- net/openvswitch/flow.c | 20 +++- net/openvswitch/flow.h | 20 +++- net/openvswitch/flow_netlink.c | 176 ++++++++++++++++++++++++----- net/openvswitch/vport-geneve.c | 236 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/vport-gre.c | 2 +- net/openvswitch/vport-vxlan.c | 2 +- net/openvswitch/vport.c | 3 + net/openvswitch/vport.h | 1 + 13 files changed, 461 insertions(+), 42 deletions(-) create mode 100644 net/openvswitch/vport-geneve.c (limited to 'net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a9ce1556d773..5bc6edeb7143 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -86,17 +86,18 @@ struct ip_tunnel { struct gro_cells gro_cells; }; -#define TUNNEL_CSUM __cpu_to_be16(0x01) -#define TUNNEL_ROUTING __cpu_to_be16(0x02) -#define TUNNEL_KEY __cpu_to_be16(0x04) -#define TUNNEL_SEQ __cpu_to_be16(0x08) -#define TUNNEL_STRICT __cpu_to_be16(0x10) -#define TUNNEL_REC __cpu_to_be16(0x20) -#define TUNNEL_VERSION __cpu_to_be16(0x40) -#define TUNNEL_NO_KEY __cpu_to_be16(0x80) +#define TUNNEL_CSUM __cpu_to_be16(0x01) +#define TUNNEL_ROUTING __cpu_to_be16(0x02) +#define TUNNEL_KEY __cpu_to_be16(0x04) +#define TUNNEL_SEQ __cpu_to_be16(0x08) +#define TUNNEL_STRICT __cpu_to_be16(0x10) +#define TUNNEL_REC __cpu_to_be16(0x20) +#define TUNNEL_VERSION __cpu_to_be16(0x40) +#define TUNNEL_NO_KEY __cpu_to_be16(0x80) #define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100) -#define TUNNEL_OAM __cpu_to_be16(0x0200) -#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) +#define TUNNEL_OAM __cpu_to_be16(0x0200) +#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) +#define TUNNEL_OPTIONS_PRESENT __cpu_to_be16(0x0800) struct tnl_ptk_info { __be16 flags; diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 6753032832e2..435eabc5ffaa 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -192,6 +192,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ OVS_VPORT_TYPE_GRE, /* GRE tunnel. */ OVS_VPORT_TYPE_VXLAN, /* VXLAN tunnel. */ + OVS_VPORT_TYPE_GENEVE, /* Geneve tunnel. */ __OVS_VPORT_TYPE_MAX }; @@ -310,6 +311,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ OVS_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ + OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 6ecf491ad509..ba3bb8203b99 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -54,3 +54,14 @@ config OPENVSWITCH_VXLAN Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config OPENVSWITCH_GENEVE + bool "Open vSwitch Geneve tunneling support" + depends on INET + depends on OPENVSWITCH + depends on GENEVE && !(OPENVSWITCH=y && GENEVE=m) + default y + ---help--- + If you say Y here, then the Open vSwitch will be able create geneve vport. + + Say N to exclude this support and reduce the binary size. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 3591cb5dae91..9a33a273c375 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,6 +15,10 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o +ifneq ($(CONFIG_OPENVSWITCH_GENEVE),) +openvswitch-y += vport-geneve.o +endif + ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) openvswitch-y += vport-vxlan.o endif diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 010125c48244..2e31d9e7f4dc 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -370,6 +370,7 @@ static size_t key_attr_size(void) + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ @@ -556,10 +557,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts); - rcu_assign_pointer(flow->sf_acts, acts); if (err) goto err_flow_free; + rcu_assign_pointer(flow->sf_acts, acts); + + OVS_CB(packet)->egress_tun_info = NULL; OVS_CB(packet)->flow = flow; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 2924cb340868..62db02ba36bc 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -448,6 +448,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) int error; struct ethhdr *eth; + /* Flags are always used as part of stats */ + key->tp.flags = 0; + skb_reset_mac_header(skb); /* Link layer. We are guaranteed to have at least the 14 byte Ethernet @@ -646,10 +649,23 @@ int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ - if (tun_info) + if (tun_info) { memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); - else + + if (tun_info->options) { + BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * + 8)) - 1 + > sizeof(key->tun_opts)); + memcpy(GENEVE_OPTS(key, tun_info->options_len), + tun_info->options, tun_info->options_len); + key->tun_opts_len = tun_info->options_len; + } else { + key->tun_opts_len = 0; + } + } else { + key->tun_opts_len = 0; memset(&key->tun_key, 0, sizeof(key->tun_key)); + } key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index fe5a71b81c1f..71813318c8c7 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -51,11 +51,24 @@ struct ovs_key_ipv4_tunnel { struct ovs_tunnel_info { struct ovs_key_ipv4_tunnel tunnel; + struct geneve_opt *options; + u8 options_len; }; +/* Store options at the end of the array if they are less than the + * maximum size. This allows us to get the benefits of variable length + * matching for small options. + */ +#define GENEVE_OPTS(flow_key, opt_len) \ + ((struct geneve_opt *)((flow_key)->tun_opts + \ + FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \ + opt_len)) + static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, const struct iphdr *iph, - __be64 tun_id, __be16 tun_flags) + __be64 tun_id, __be16 tun_flags, + struct geneve_opt *opts, + u8 opts_len) { tun_info->tunnel.tun_id = tun_id; tun_info->tunnel.ipv4_src = iph->saddr; @@ -67,9 +80,14 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, /* clear struct padding. */ memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); + + tun_info->options = opts; + tun_info->options_len = opts_len; } struct sw_flow_key { + u8 tun_opts[255]; + u8 tun_opts_len; struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 5d6194d9dadc..368f23307911 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -88,18 +89,20 @@ static void update_range__(struct sw_flow_match *match, } \ } while (0) -#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - len, is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - memcpy(&(match)->mask->key.field, value_p, len);\ - } else { \ - memcpy(&(match)->key->field, value_p, len); \ - } \ +#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ + do { \ + update_range__(match, offset, len, is_mask); \ + if (is_mask) \ + memcpy((u8 *)&(match)->mask->key + offset, value_p, \ + len); \ + else \ + memcpy((u8 *)(match)->key + offset, value_p, len); \ } while (0) +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ + value_p, len, is_mask) + static u16 range_n_bytes(const struct sw_flow_key_range *range) { return range->end - range->start; @@ -335,6 +338,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, int rem; bool ttl = false; __be16 tun_flags = 0; + unsigned long opt_key_offset; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); @@ -347,6 +351,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, [OVS_TUNNEL_KEY_ATTR_OAM] = 0, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1, }; if (type > OVS_TUNNEL_KEY_ATTR_MAX) { @@ -355,7 +360,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - if (ovs_tunnel_key_lens[type] != nla_len(a)) { + if (ovs_tunnel_key_lens[type] != nla_len(a) && + ovs_tunnel_key_lens[type] != -1) { OVS_NLERR("IPv4 tunnel attribute type has unexpected " " length (type=%d, length=%d, expected=%d).\n", type, nla_len(a), ovs_tunnel_key_lens[type]); @@ -394,7 +400,60 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_OAM: tun_flags |= TUNNEL_OAM; break; + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + tun_flags |= TUNNEL_OPTIONS_PRESENT; + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR("Geneve option length exceeds maximum size (len %d, max %zu).\n", + nla_len(a), + sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (nla_len(a) % 4 != 0) { + OVS_NLERR("Geneve option length is not a multiple of 4 (len %d).\n", + nla_len(a)); + return -EINVAL; + } + + /* We need to record the length of the options passed + * down, otherwise packets with the same format but + * additional options will be silently matched. + */ + if (!is_mask) { + SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), + false); + } else { + /* This is somewhat unusual because it looks at + * both the key and mask while parsing the + * attributes (and by extension assumes the key + * is parsed first). Normally, we would verify + * that each is the correct length and that the + * attributes line up in the validate function. + * However, that is difficult because this is + * variable length and we won't have the + * information later. + */ + if (match->key->tun_opts_len != nla_len(a)) { + OVS_NLERR("Geneve option key length (%d) is different from mask length (%d).", + match->key->tun_opts_len, + nla_len(a)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, + true); + } + + opt_key_offset = (unsigned long)GENEVE_OPTS( + (struct sw_flow_key *)0, + nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, + nla_data(a), nla_len(a), + is_mask); + break; default: + OVS_NLERR("Unknown IPv4 tunnel attribute (%d).\n", + type); return -EINVAL; } } @@ -421,16 +480,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return 0; } -static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key, - const struct ovs_key_ipv4_tunnel *output) +static int __ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const struct geneve_opt *tun_opts, + int swkey_tun_opts_len) { - struct nlattr *nla; - - nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); - if (!nla) - return -EMSGSIZE; - if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; @@ -454,12 +508,35 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, if ((output->tun_flags & TUNNEL_OAM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; + if (tun_opts && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, + swkey_tun_opts_len, tun_opts)) + return -EMSGSIZE; - nla_nest_end(skb, nla); return 0; } +static int ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const struct geneve_opt *tun_opts, + int swkey_tun_opts_len) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len); + if (err) + return err; + + nla_nest_end(skb, nla); + return 0; +} + static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask) { @@ -905,9 +982,16 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if ((swkey->tun_key.ipv4_dst || is_mask) && - ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) - goto nla_put_failure; + if ((swkey->tun_key.ipv4_dst || is_mask)) { + const struct geneve_opt *opts = NULL; + + if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) + opts = GENEVE_OPTS(output, swkey->tun_opts_len); + + if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len)) + goto nla_put_failure; + } if (swkey->phy.in_port == DP_MAX_PORTS) { if (is_mask && (output->phy.in_port == 0xffff)) @@ -1290,17 +1374,55 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (err) return err; + if (key.tun_opts_len) { + struct geneve_opt *option = GENEVE_OPTS(&key, + key.tun_opts_len); + int opts_len = key.tun_opts_len; + bool crit_opt = false; + + while (opts_len > 0) { + int len; + + if (opts_len < sizeof(*option)) + return -EINVAL; + + len = sizeof(*option) + option->length * 4; + if (len > opts_len) + return -EINVAL; + + crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE); + + option = (struct geneve_opt *)((u8 *)option + len); + opts_len -= len; + }; + + key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; + }; + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); if (start < 0) return start; a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info)); + sizeof(*tun_info) + key.tun_opts_len); if (IS_ERR(a)) return PTR_ERR(a); tun_info = nla_data(a); tun_info->tunnel = key.tun_key; + tun_info->options_len = key.tun_opts_len; + + if (tun_info->options_len) { + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + memcpy((tun_info + 1), GENEVE_OPTS(&key, key.tun_opts_len), + key.tun_opts_len); + tun_info->options = (struct geneve_opt *)(tun_info + 1); + } else { + tun_info->options = NULL; + } add_nested_action_end(*sfa, start); @@ -1592,7 +1714,9 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) return -EMSGSIZE; err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, - nla_data(ovs_key)); + tun_info->options_len ? + tun_info->options : NULL, + tun_info->options_len); if (err) return err; nla_nest_end(skb, start); diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c new file mode 100644 index 000000000000..5572d482f285 --- /dev/null +++ b/net/openvswitch/vport-geneve.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +/** + * struct geneve_port - Keeps track of open UDP ports + * @sock: The socket created for this port number. + * @name: vport name. + */ +struct geneve_port { + struct geneve_sock *gs; + char name[IFNAMSIZ]; +}; + +static LIST_HEAD(geneve_ports); + +static inline struct geneve_port *geneve_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) +{ + return (struct genevehdr *)(udp_hdr(skb) + 1); +} + +/* Convert 64 bit tunnel ID to 24 bit VNI. */ +static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) +{ +#ifdef __BIG_ENDIAN + vni[0] = (__force __u8)(tun_id >> 16); + vni[1] = (__force __u8)(tun_id >> 8); + vni[2] = (__force __u8)tun_id; +#else + vni[0] = (__force __u8)((__force u64)tun_id >> 40); + vni[1] = (__force __u8)((__force u64)tun_id >> 48); + vni[2] = (__force __u8)((__force u64)tun_id >> 56); +#endif +} + +/* Convert 24 bit VNI to 64 bit tunnel ID. */ +static __be64 vni_to_tunnel_id(__u8 *vni) +{ +#ifdef __BIG_ENDIAN + return (vni[0] << 16) | (vni[1] << 8) | vni[2]; +#else + return (__force __be64)(((__force u64)vni[0] << 40) | + ((__force u64)vni[1] << 48) | + ((__force u64)vni[2] << 56)); +#endif +} + +static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) +{ + struct vport *vport = gs->rcv_data; + struct genevehdr *geneveh = geneve_hdr(skb); + int opts_len; + struct ovs_tunnel_info tun_info; + __be64 key; + __be16 flags; + + opts_len = geneveh->opt_len * 4; + + flags = TUNNEL_KEY | TUNNEL_OPTIONS_PRESENT | + (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | + (geneveh->oam ? TUNNEL_OAM : 0) | + (geneveh->critical ? TUNNEL_CRIT_OPT : 0); + + key = vni_to_tunnel_id(geneveh->vni); + + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, flags, + geneveh->options, opts_len); + + ovs_vport_receive(vport, skb, &tun_info); +} + +static int geneve_get_options(const struct vport *vport, + struct sk_buff *skb) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + __be16 sport; + + sport = ntohs(inet_sk(geneve_port->gs->sock->sk)->inet_sport); + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, sport)) + return -EMSGSIZE; + return 0; +} + +static void geneve_tnl_destroy(struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + geneve_sock_release(geneve_port->gs); + + ovs_vport_deferred_free(vport); +} + +static struct vport *geneve_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct geneve_port *geneve_port; + struct geneve_sock *gs; + struct vport *vport; + struct nlattr *a; + int err; + u16 dst_port; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct geneve_port), + &ovs_geneve_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + geneve_port = geneve_vport(vport); + strncpy(geneve_port->name, parms->name, IFNAMSIZ); + + gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0); + if (IS_ERR(gs)) { + ovs_vport_free(vport); + return (void *)gs; + } + geneve_port->gs = gs; + + return vport; +error: + return ERR_PTR(err); +} + +static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct ovs_key_ipv4_tunnel *tun_key; + struct ovs_tunnel_info *tun_info; + struct net *net = ovs_dp_get_net(vport->dp); + struct geneve_port *geneve_port = geneve_vport(vport); + __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; + __be16 sport; + struct rtable *rt; + struct flowi4 fl; + u8 vni[3]; + __be16 df; + int err; + + tun_info = OVS_CB(skb)->egress_tun_info; + if (unlikely(!tun_info)) { + err = -EINVAL; + goto error; + } + + tun_key = &tun_info->tunnel; + + /* Route lookup */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + tunnel_id_to_vni(tun_key->tun_id, vni); + skb->ignore_df = 1; + + err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, + tun_key->ipv4_dst, tun_key->ipv4_tos, + tun_key->ipv4_ttl, df, sport, dport, + tun_key->tun_flags, vni, + tun_info->options_len, (u8 *)tun_info->options, + false); + if (err < 0) + ip_rt_put(rt); +error: + return err; +} + +static const char *geneve_get_name(const struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + return geneve_port->name; +} + +const struct vport_ops ovs_geneve_vport_ops = { + .type = OVS_VPORT_TYPE_GENEVE, + .create = geneve_tnl_create, + .destroy = geneve_tnl_destroy, + .get_name = geneve_get_name, + .get_options = geneve_get_options, + .send = geneve_tnl_send, +}; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index fe768bd600eb..108b82da2fd9 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -106,7 +106,7 @@ static int gre_rcv(struct sk_buff *skb, key = key_to_tunnel_id(tpi->key, tpi->seq); ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, - filter_tnl_flags(tpi->flags)); + filter_tnl_flags(tpi->flags), NULL, 0); ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 5fbff2c1ee49..2735e01dca73 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -66,7 +66,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(vx_vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY); + ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY, NULL, 0); ovs_vport_receive(vport, skb, &tun_info); } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 3e50ee8a218c..53001b020ca7 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -48,6 +48,9 @@ static const struct vport_ops *vport_ops_list[] = { #ifdef CONFIG_OPENVSWITCH_VXLAN &ovs_vxlan_vport_ops, #endif +#ifdef CONFIG_OPENVSWITCH_GENEVE + &ovs_geneve_vport_ops, +#endif }; /* Protected by RCU read lock for reading, ovs_mutex for writing. */ diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index e28964aba021..8942125de3a6 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -215,6 +215,7 @@ extern const struct vport_ops ovs_netdev_vport_ops; extern const struct vport_ops ovs_internal_vport_ops; extern const struct vport_ops ovs_gre_vport_ops; extern const struct vport_ops ovs_vxlan_vport_ops; +extern const struct vport_ops ovs_geneve_vport_ops; static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) -- cgit v1.2.3 From bec3cfdca36bf43cfa3751ad7b56db1a307e0760 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Oct 2014 20:59:19 -0700 Subject: net: skb_segment() provides list head and tail Its unfortunate we have to walk again skb list to find the tail after segmentation, even if data is probably hot in cpu caches. skb_segment() can store the tail of the list into segs->prev, and validate_xmit_skb_list() can immediately get the tail. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 27 +++++++++++++++------------ net/core/skbuff.c | 5 +++++ 2 files changed, 20 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1a90530f83ff..7d5691cc1f47 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2724,22 +2724,25 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d { struct sk_buff *next, *head = NULL, *tail; - while (skb) { + for (; skb != NULL; skb = next) { next = skb->next; skb->next = NULL; + + /* in case skb wont be segmented, point to itself */ + skb->prev = skb; + skb = validate_xmit_skb(skb, dev); - if (skb) { - struct sk_buff *end = skb; + if (!skb) + continue; - while (end->next) - end = end->next; - if (!head) - head = skb; - else - tail->next = skb; - tail = end; - } - skb = next; + if (!head) + head = skb; + else + tail->next = skb; + /* If skb was segmented, skb->prev points to + * the last segment. If not, it still contains skb. + */ + tail = skb->prev; } return head; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9a423e2c5766..7b3df0d518ab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3083,6 +3083,11 @@ perform_csum_check: } } while ((offset += len) < head_skb->len); + /* Some callers want to get the end of the list. + * Put it in segs->prev to avoid walking the list. + * (see validate_xmit_skb_list() for example) + */ + segs->prev = tail; return segs; err: -- cgit v1.2.3 From f2600cf02b5b59aaee082c3485b7f01fc7f7b70c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Oct 2014 10:11:31 -0700 Subject: net: sched: avoid costly atomic operation in fq_dequeue() Standard qdisc API to setup a timer implies an atomic operation on every packet dequeue : qdisc_unthrottled() It turns out this is not really needed for FQ, as FQ has no concept of global qdisc throttling, being a qdisc handling many different flows, some of them can be throttled, while others are not. Fix is straightforward : add a 'bool throttle' to qdisc_watchdog_schedule_ns(), and remove calls to qdisc_unthrottled() in sch_fq. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 4 ++-- net/sched/sch_api.c | 5 +++-- net/sched/sch_fq.c | 6 ++---- net/sched/sch_tbf.c | 3 ++- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index e4b3c828c1c2..27a33833ff4a 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -65,12 +65,12 @@ struct qdisc_watchdog { }; void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle); static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) { - qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires)); + qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires), true); } void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c79a226cc25c..2cf61b3e633c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -594,13 +594,14 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle) { if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; - qdisc_throttled(wd->qdisc); + if (throttle) + qdisc_throttled(wd->qdisc); hrtimer_start(&wd->timer, ns_to_ktime(expires), diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index c9b9fcb53206..cbd7e1fd23b4 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -377,7 +377,6 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (time_after(jiffies, f->age + q->flow_refill_delay)) f->credit = max_t(u32, f->credit, q->quantum); q->inactive_flows--; - qdisc_unthrottled(sch); } /* Note: this overwrites f->age */ @@ -385,7 +384,6 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(f == &q->internal)) { q->stat_internal_packets++; - qdisc_unthrottled(sch); } sch->q.qlen++; @@ -433,7 +431,8 @@ begin: if (!head->first) { if (q->time_next_delayed_flow != ~0ULL) qdisc_watchdog_schedule_ns(&q->watchdog, - q->time_next_delayed_flow); + q->time_next_delayed_flow, + false); return NULL; } } @@ -495,7 +494,6 @@ begin: } out: qdisc_bstats_update(sch, skb); - qdisc_unthrottled(sch); return skb; } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 77edffe329c4..a4afde14e865 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -268,7 +268,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) } qdisc_watchdog_schedule_ns(&q->watchdog, - now + max_t(long, -toks, -ptoks)); + now + max_t(long, -toks, -ptoks), + true); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, -- cgit v1.2.3 From 1255a5055449781a92076fc5429952f2b33cf309 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 12:35:21 +0300 Subject: ethtool: Ethtool parameter to dynamically change tx_copybreak Use new ethtool [sg]et_tunable() to set tx_copybread (inline threshold) Signed-off-by: Eric Dumazet Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 1 + net/core/ethtool.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 7a364f2f3d3f..99b43056a6fe 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -212,6 +212,7 @@ struct ethtool_value { enum tunable_id { ETHTOOL_ID_UNSPEC, ETHTOOL_RX_COPYBREAK, + ETHTOOL_TX_COPYBREAK, }; enum tunable_type_id { diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 27e61b886520..1600aa24d36b 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1625,6 +1625,7 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: + case ETHTOOL_TX_COPYBREAK: if (tuna->len != sizeof(u32) || tuna->type_id != ETHTOOL_TUNABLE_U32) return -EINVAL; -- cgit v1.2.3 From b47bd8d2795412143a23d2961033c126ce6914d9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 5 Oct 2014 17:27:50 +0200 Subject: ipv4: igmp: fix v3 general query drop monitor false positive In case we find a general query with non-zero number of sources, we are dropping the skb as it's malformed. RFC3376, section 4.1.8. Number of Sources (N): This number is zero in a General Query or a Group-Specific Query, and non-zero in a Group-and-Source-Specific Query. Therefore, reflect that by using kfree_skb() instead of consume_skb(). Fixes: d679c5324d9a ("igmp: avoid drop_monitor false positives") Signed-off-by: Daniel Borkmann Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4146153d875d..fb70e3ecc3e4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -931,7 +931,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, in_dev->mr_qrv = ih3->qrv; if (!group) { /* general query */ if (ih3->nsrcs) - return false; /* no sources allowed */ + return true; /* no sources allowed */ igmp_gq_start_timer(in_dev); return false; } -- cgit v1.2.3 From fcbeb976d7ce783fd58e63e61c196d9a8912b3be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 10:11:27 -0700 Subject: net: introduce netdevice gso_min_segs attribute Some TSO engines might have a too heavy setup cost, that impacts performance on hosts sending small bursts (2 MSS per packet). This patch adds a device gso_min_segs, allowing drivers to set a minimum segment size for TSO packets, according to the NIC performance. Tested on a mlx4 NIC, this allows to get a ~110% increase of throughput when sending 2 MSS per packet. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- net/core/dev.c | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 22d54b9b700d..2df86f50261c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1416,6 +1416,8 @@ enum netdev_priv_flags { * @gso_max_size: Maximum size of generic segmentation offload * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO + * @gso_min_segs: Minimum number of segments that can be passed to the + * NIC for GSO * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device @@ -1666,7 +1668,7 @@ struct net_device { unsigned int gso_max_size; #define GSO_MAX_SEGS 65535 u16 gso_max_segs; - + u16 gso_min_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif diff --git a/net/core/dev.c b/net/core/dev.c index 7d5691cc1f47..c1a4de275576 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2567,10 +2567,12 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t netif_skb_features(struct sk_buff *skb) { + const struct net_device *dev = skb->dev; + netdev_features_t features = dev->features; + u16 gso_segs = skb_shinfo(skb)->gso_segs; __be16 protocol = skb->protocol; - netdev_features_t features = skb->dev->features; - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { @@ -2581,7 +2583,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) } features = netdev_intersect_features(features, - skb->dev->vlan_features | + dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); @@ -6661,6 +6663,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gso_min_segs = 0; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); -- cgit v1.2.3 From 82a470f1119eb7d2e4941b915bf9cd6fd8d54494 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 5 Oct 2014 21:27:53 -0700 Subject: net: sched: remove tcf_proto from ematch calls This removes the tcf_proto argument from the ematch code paths that only need it to reference the net namespace. This allows simplifying qdisc code paths especially when we need to tear down the ematch from an RCU callback. In this case we can not guarentee that the tcf_proto structure is still valid. Signed-off-by: John Fastabend Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 10 +++++----- net/sched/cls_basic.c | 2 +- net/sched/cls_cgroup.c | 4 ++-- net/sched/cls_flow.c | 4 ++-- net/sched/em_canid.c | 4 ++-- net/sched/em_ipset.c | 7 +++---- net/sched/em_meta.c | 4 ++-- net/sched/em_nbyte.c | 2 +- net/sched/em_text.c | 4 ++-- net/sched/ematch.c | 10 ++++++---- 10 files changed, 26 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index ef44ad9a6426..bc49967e1a68 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -166,6 +166,7 @@ struct tcf_ematch { unsigned int datalen; u16 matchid; u16 flags; + struct net *net; }; static inline int tcf_em_is_container(struct tcf_ematch *em) @@ -229,12 +230,11 @@ struct tcf_ematch_tree { struct tcf_ematch_ops { int kind; int datalen; - int (*change)(struct tcf_proto *, void *, + int (*change)(struct net *net, void *, int, struct tcf_ematch *); int (*match)(struct sk_buff *, struct tcf_ematch *, struct tcf_pkt_info *); - void (*destroy)(struct tcf_proto *, - struct tcf_ematch *); + void (*destroy)(struct tcf_ematch *); int (*dump)(struct sk_buff *, struct tcf_ematch *); struct module *owner; struct list_head link; @@ -244,7 +244,7 @@ int tcf_em_register(struct tcf_ematch_ops *); void tcf_em_unregister(struct tcf_ematch_ops *); int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *, struct tcf_ematch_tree *); -void tcf_em_tree_destroy(struct tcf_proto *, struct tcf_ematch_tree *); +void tcf_em_tree_destroy(struct tcf_ematch_tree *); int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); @@ -301,7 +301,7 @@ struct tcf_ematch_tree { }; #define tcf_em_tree_validate(tp, tb, t) ((void)(t), 0) -#define tcf_em_tree_destroy(tp, t) do { (void)(t); } while(0) +#define tcf_em_tree_destroy(t) do { (void)(t); } while(0) #define tcf_em_tree_dump(skb, t, tlv) (0) #define tcf_em_tree_change(tp, dst, src) do { } while(0) #define tcf_em_tree_match(skb, t, info) ((void)(info), 1) diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index fe20826a79e7..90647a8af8ca 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -95,7 +95,7 @@ static void basic_delete_filter(struct rcu_head *head) tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(&f->exts); - tcf_em_tree_destroy(tp, &f->ematches); + tcf_em_tree_destroy(&f->ematches); kfree(f); } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 3409f168225f..2f77a89655dc 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -87,7 +87,7 @@ static void cls_cgroup_destroy_rcu(struct rcu_head *root) rcu); tcf_exts_destroy(&head->exts); - tcf_em_tree_destroy(head->tp, &head->ematches); + tcf_em_tree_destroy(&head->ematches); kfree(head); } @@ -157,7 +157,7 @@ static void cls_cgroup_destroy(struct tcf_proto *tp) if (head) { tcf_exts_destroy(&head->exts); - tcf_em_tree_destroy(tp, &head->ematches); + tcf_em_tree_destroy(&head->ematches); RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index f18d27f7b5f2..a5d2b20db560 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -355,7 +355,7 @@ static void flow_destroy_filter(struct rcu_head *head) del_timer_sync(&f->perturb_timer); tcf_exts_destroy(&f->exts); - tcf_em_tree_destroy(f->tp, &f->ematches); + tcf_em_tree_destroy(&f->ematches); kfree(f); } @@ -530,7 +530,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, return 0; err2: - tcf_em_tree_destroy(tp, &t); + tcf_em_tree_destroy(&t); kfree(fnew); err1: tcf_exts_destroy(&e); diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index 7c292d474f47..ddd883ca55b2 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -120,7 +120,7 @@ static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, return match; } -static int em_canid_change(struct tcf_proto *tp, void *data, int len, +static int em_canid_change(struct net *net, void *data, int len, struct tcf_ematch *m) { struct can_filter *conf = data; /* Array with rules */ @@ -183,7 +183,7 @@ static int em_canid_change(struct tcf_proto *tp, void *data, int len, return 0; } -static void em_canid_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +static void em_canid_destroy(struct tcf_ematch *m) { struct canid_match *cm = em_canid_priv(m); diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index 527aeb7a3ff0..5b4a4efe468c 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -19,12 +19,11 @@ #include #include -static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, +static int em_ipset_change(struct net *net, void *data, int data_len, struct tcf_ematch *em) { struct xt_set_info *set = data; ip_set_id_t index; - struct net *net = dev_net(qdisc_dev(tp->q)); if (data_len != sizeof(*set)) return -EINVAL; @@ -42,11 +41,11 @@ static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, return -ENOMEM; } -static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em) +static void em_ipset_destroy(struct tcf_ematch *em) { const struct xt_set_info *set = (const void *) em->data; if (set) { - ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index); + ip_set_nfnl_put(em->net, set->index); kfree((void *) em->data); } } diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 9b8c0b0e60d7..c8f8c399b99a 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -856,7 +856,7 @@ static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) }, }; -static int em_meta_change(struct tcf_proto *tp, void *data, int len, +static int em_meta_change(struct net *net, void *data, int len, struct tcf_ematch *m) { int err; @@ -908,7 +908,7 @@ errout: return err; } -static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +static void em_meta_destroy(struct tcf_ematch *m) { if (m) meta_delete((struct meta_match *) m->data); diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index a3bed07a008b..df3110d69585 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -23,7 +23,7 @@ struct nbyte_data { char pattern[0]; }; -static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len, +static int em_nbyte_change(struct net *net, void *data, int data_len, struct tcf_ematch *em) { struct tcf_em_nbyte *nbyte = data; diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 15d353d2e4be..f03c3de16c27 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -45,7 +45,7 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX; } -static int em_text_change(struct tcf_proto *tp, void *data, int len, +static int em_text_change(struct net *net, void *data, int len, struct tcf_ematch *m) { struct text_match *tm; @@ -100,7 +100,7 @@ retry: return 0; } -static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +static void em_text_destroy(struct tcf_ematch *m) { if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) textsearch_destroy(EM_TEXT_PRIV(m)->config); diff --git a/net/sched/ematch.c b/net/sched/ematch.c index ad57f4444b9c..8250c36543d8 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -178,6 +178,7 @@ static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_hdr *em_hdr = nla_data(nla); int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); + struct net *net = dev_net(qdisc_dev(tp->q)); if (!TCF_EM_REL_VALID(em_hdr->flags)) goto errout; @@ -240,7 +241,7 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; if (em->ops->change) { - err = em->ops->change(tp, data, data_len, em); + err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; } else if (data_len > 0) { @@ -271,6 +272,7 @@ static int tcf_em_validate(struct tcf_proto *tp, em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; em->datalen = data_len; + em->net = net; err = 0; errout: @@ -378,7 +380,7 @@ errout: return err; errout_abort: - tcf_em_tree_destroy(tp, tree); + tcf_em_tree_destroy(tree); return err; } EXPORT_SYMBOL(tcf_em_tree_validate); @@ -393,7 +395,7 @@ EXPORT_SYMBOL(tcf_em_tree_validate); * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that * the ematch tree is not in use before calling this function. */ -void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) +void tcf_em_tree_destroy(struct tcf_ematch_tree *tree) { int i; @@ -405,7 +407,7 @@ void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) if (em->ops) { if (em->ops->destroy) - em->ops->destroy(tp, em); + em->ops->destroy(em); else if (!tcf_em_is_simple(em)) kfree((void *) em->data); module_put(em->ops->owner); -- cgit v1.2.3 From 13990f8156862fe945a1a226850a6550c8988a33 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 5 Oct 2014 21:28:20 -0700 Subject: net: sched: cls_cgroup tear down exts and ematch from rcu callback It is not RCU safe to destroy the action chain while there is a possibility of readers accessing it. Move this code into the rcu callback using the same rcu callback used in the code patch to make a change to head. Signed-off-by: John Fastabend Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_cgroup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 2f77a89655dc..d61a801222c1 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -156,10 +156,8 @@ static void cls_cgroup_destroy(struct tcf_proto *tp) struct cls_cgroup_head *head = rtnl_dereference(tp->root); if (head) { - tcf_exts_destroy(&head->exts); - tcf_em_tree_destroy(&head->ematches); RCU_INIT_POINTER(tp->root, NULL); - kfree_rcu(head, rcu); + call_rcu(&head->rcu, cls_cgroup_destroy_rcu); } } -- cgit v1.2.3 From 18cdb37ebf4c986d9502405cbd16b0ac29770c25 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 5 Oct 2014 21:28:52 -0700 Subject: net: sched: do not use tcf_proto 'tp' argument from call_rcu Using the tcf_proto pointer 'tp' from inside the classifiers callback is not valid because it may have been cleaned up by another call_rcu occuring on another CPU. 'tp' is currently being used by tcf_unbind_filter() in this patch we move instances of tcf_unbind_filter outside of the call_rcu() context. This is safe to do because any running schedulers will either read the valid class field or it will be zeroed. And all schedulers today when the class is 0 do a lookup using the same call used by the tcf_exts_bind(). So even if we have a running classifier hit the null class pointer it will do a lookup and get to the same result. This is particularly fragile at the moment because the only way to verify this is to audit the schedulers call sites. Reported-by: Cong Wang Signed-off-by: John Fastabend Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_basic.c | 5 +++-- net/sched/cls_bpf.c | 4 +++- net/sched/cls_fw.c | 5 +++-- net/sched/cls_route.c | 8 +++++--- 4 files changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 90647a8af8ca..cd61280941e5 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -91,9 +91,7 @@ static int basic_init(struct tcf_proto *tp) static void basic_delete_filter(struct rcu_head *head) { struct basic_filter *f = container_of(head, struct basic_filter, rcu); - struct tcf_proto *tp = f->tp; - tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(&f->ematches); kfree(f); @@ -106,6 +104,7 @@ static void basic_destroy(struct tcf_proto *tp) list_for_each_entry_safe(f, n, &head->flist, link) { list_del_rcu(&f->link); + tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, basic_delete_filter); } RCU_INIT_POINTER(tp->root, NULL); @@ -120,6 +119,7 @@ static int basic_delete(struct tcf_proto *tp, unsigned long arg) list_for_each_entry(t, &head->flist, link) if (t == f) { list_del_rcu(&t->link); + tcf_unbind_filter(tp, &t->res); call_rcu(&t->rcu, basic_delete_filter); return 0; } @@ -222,6 +222,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (fold) { list_replace_rcu(&fold->link, &fnew->link); + tcf_unbind_filter(tp, &fold->res); call_rcu(&fold->rcu, basic_delete_filter); } else { list_add_rcu(&fnew->link, &head->flist); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 4318d067b0a0..eed49d1d0878 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -92,7 +92,6 @@ static int cls_bpf_init(struct tcf_proto *tp) static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) { - tcf_unbind_filter(tp, &prog->res); tcf_exts_destroy(&prog->exts); bpf_prog_destroy(prog->filter); @@ -116,6 +115,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) list_for_each_entry(prog, &head->plist, link) { if (prog == todel) { list_del_rcu(&prog->link); + tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); return 0; } @@ -131,6 +131,7 @@ static void cls_bpf_destroy(struct tcf_proto *tp) list_for_each_entry_safe(prog, tmp, &head->plist, link) { list_del_rcu(&prog->link); + tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); } @@ -282,6 +283,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (oldprog) { list_replace_rcu(&prog->link, &oldprog->link); + tcf_unbind_filter(tp, &oldprog->res); call_rcu(&oldprog->rcu, __cls_bpf_delete_prog); } else { list_add_rcu(&prog->link, &head->plist); diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index da805aeeb65c..dbfdfd1f1a9f 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -123,9 +123,7 @@ static int fw_init(struct tcf_proto *tp) static void fw_delete_filter(struct rcu_head *head) { struct fw_filter *f = container_of(head, struct fw_filter, rcu); - struct tcf_proto *tp = f->tp; - tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(&f->exts); kfree(f); } @@ -143,6 +141,7 @@ static void fw_destroy(struct tcf_proto *tp) while ((f = rtnl_dereference(head->ht[h])) != NULL) { RCU_INIT_POINTER(head->ht[h], rtnl_dereference(f->next)); + tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, fw_delete_filter); } } @@ -166,6 +165,7 @@ static int fw_delete(struct tcf_proto *tp, unsigned long arg) fp = &pfp->next, pfp = rtnl_dereference(*fp)) { if (pfp == f) { RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); + tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, fw_delete_filter); return 0; } @@ -280,6 +280,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(fnew->next, rtnl_dereference(pfp->next)); rcu_assign_pointer(*fp, fnew); + tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, fw_delete_filter); *arg = (unsigned long)fnew; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index b665aee661f7..6f22baae0afa 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -269,9 +269,7 @@ static void route4_delete_filter(struct rcu_head *head) { struct route4_filter *f = container_of(head, struct route4_filter, rcu); - struct tcf_proto *tp = f->tp; - tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(&f->exts); kfree(f); } @@ -297,6 +295,7 @@ static void route4_destroy(struct tcf_proto *tp) next = rtnl_dereference(f->next); RCU_INIT_POINTER(b->ht[h2], next); + tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, route4_delete_filter); } } @@ -338,6 +337,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) route4_reset_fastmap(head); /* Delete it */ + tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, route4_delete_filter); /* Strip RTNL protected tree */ @@ -545,8 +545,10 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, route4_reset_fastmap(head); *arg = (unsigned long)f; - if (fold) + if (fold) { + tcf_unbind_filter(tp, &fold->res); call_rcu(&fold->rcu, route4_delete_filter); + } return 0; errout: -- cgit v1.2.3 From 79952bca8619b62c9b1a118238ca16ab41be7760 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 6 Oct 2014 20:15:20 +0200 Subject: net: fix rcu access on phonet_routes -Add __rcu annotation on table to fix sparse warnings: net/phonet/pn_dev.c:279:25: warning: incorrect type in assignment (different address spaces) net/phonet/pn_dev.c:279:25: expected struct net_device * net/phonet/pn_dev.c:279:25: got void [noderef] * net/phonet/pn_dev.c:376:17: warning: incorrect type in assignment (different address spaces) net/phonet/pn_dev.c:376:17: expected struct net_device *volatile net/phonet/pn_dev.c:376:17: got struct net_device [noderef] * net/phonet/pn_dev.c:392:17: warning: incorrect type in assignment (different address spaces) net/phonet/pn_dev.c:392:17: expected struct net_device * net/phonet/pn_dev.c:392:17: got void [noderef] * -Access table with rcu_access_pointer (fixes the following sparse errors): net/phonet/pn_dev.c:278:25: error: incompatible types in comparison expression (different address spaces) net/phonet/pn_dev.c:391:17: error: incompatible types in comparison expression (different address spaces) Suggested-by: Eric Dumazet Signed-off-by: Fabian Frederick Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/phonet/pn_dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 56a6146ac94b..a58680016472 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -36,7 +36,7 @@ struct phonet_routes { struct mutex lock; - struct net_device *table[64]; + struct net_device __rcu *table[64]; }; struct phonet_net { @@ -275,7 +275,7 @@ static void phonet_route_autodel(struct net_device *dev) bitmap_zero(deleted, 64); mutex_lock(&pnn->routes.lock); for (i = 0; i < 64; i++) - if (dev == pnn->routes.table[i]) { + if (rcu_access_pointer(pnn->routes.table[i]) == dev) { RCU_INIT_POINTER(pnn->routes.table[i], NULL); set_bit(i, deleted); } @@ -388,7 +388,7 @@ int phonet_route_del(struct net_device *dev, u8 daddr) daddr = daddr >> 2; mutex_lock(&routes->lock); - if (dev == routes->table[daddr]) + if (rcu_access_pointer(routes->table[daddr]) == dev) RCU_INIT_POINTER(routes->table[daddr], NULL); else dev = NULL; -- cgit v1.2.3 From 1ff0dc9499b25d016777f9b8d3ee486fd588ba59 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Oct 2014 11:26:27 -0700 Subject: net: validate_xmit_vlan() is static Marking this as static allows compiler to inline it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c1a4de275576..a63b8c43c1b6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2645,7 +2645,8 @@ out: return skb; } -struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) +static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, + netdev_features_t features) { if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { -- cgit v1.2.3 From 94b2cfe02bfe3f1918d91bd6f498e308c5605cbc Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 6 Oct 2014 19:58:34 +0200 Subject: ipv6: minor fib6 cleanups like type safety, bool conversion, inline removal Also renamed struct fib6_walker_t to fib6_walker and enum fib_walk_state_t to fib6_walk_state as recommended by Cong Wang. Cc: Cong Wang Cc: YOSHIFUJI Hideaki Cc: Martin Lau Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 18 ++++++++++--- net/ipv6/ip6_fib.c | 71 ++++++++++++++++++++++----------------------------- 2 files changed, 45 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index cf485f9aa563..9221bf4c64f7 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -202,15 +202,25 @@ static inline void ip6_rt_put(struct rt6_info *rt) dst_release(&rt->dst); } -struct fib6_walker_t { +enum fib6_walk_state { +#ifdef CONFIG_IPV6_SUBTREES + FWS_S, +#endif + FWS_L, + FWS_R, + FWS_C, + FWS_U +}; + +struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; struct rt6_info *leaf; - unsigned char state; - unsigned char prune; + enum fib6_walk_state state; + bool prune; unsigned int skip; unsigned int count; - int (*func)(struct fib6_walker_t *); + int (*func)(struct fib6_walker *); void *args; }; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 97b9fa8de377..e8d7465b1597 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -46,18 +46,8 @@ static struct kmem_cache *fib6_node_kmem __read_mostly; -enum fib_walk_state_t { -#ifdef CONFIG_IPV6_SUBTREES - FWS_S, -#endif - FWS_L, - FWS_R, - FWS_C, - FWS_U -}; - -struct fib6_cleaner_t { - struct fib6_walker_t w; +struct fib6_cleaner { + struct fib6_walker w; struct net *net; int (*func)(struct rt6_info *, void *arg); void *arg; @@ -74,8 +64,8 @@ static DEFINE_RWLOCK(fib6_walker_lock); static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); -static int fib6_walk(struct fib6_walker_t *w); -static int fib6_walk_continue(struct fib6_walker_t *w); +static int fib6_walk(struct fib6_walker *w); +static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the @@ -91,20 +81,21 @@ static void fib6_gc_timer_cb(unsigned long arg); static LIST_HEAD(fib6_walkers); #define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) -static inline void fib6_walker_link(struct fib6_walker_t *w) +static void fib6_walker_link(struct fib6_walker *w) { write_lock_bh(&fib6_walker_lock); list_add(&w->lh, &fib6_walkers); write_unlock_bh(&fib6_walker_lock); } -static inline void fib6_walker_unlink(struct fib6_walker_t *w) +static void fib6_walker_unlink(struct fib6_walker *w) { write_lock_bh(&fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&fib6_walker_lock); } -static __inline__ u32 fib6_new_sernum(void) + +static u32 fib6_new_sernum(void) { u32 n = ++rt_sernum; if ((__s32)n <= 0) @@ -128,7 +119,7 @@ static __inline__ u32 fib6_new_sernum(void) # define BITOP_BE32_SWIZZLE 0 #endif -static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) +static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* @@ -142,7 +133,7 @@ static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -static __inline__ struct fib6_node *node_alloc(void) +static struct fib6_node *node_alloc(void) { struct fib6_node *fn; @@ -151,12 +142,12 @@ static __inline__ struct fib6_node *node_alloc(void) return fn; } -static __inline__ void node_free(struct fib6_node *fn) +static void node_free(struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); } -static __inline__ void rt6_release(struct rt6_info *rt) +static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) dst_free(&rt->dst); @@ -267,7 +258,7 @@ static void __net_init fib6_tables_init(struct net *net) #endif -static int fib6_dump_node(struct fib6_walker_t *w) +static int fib6_dump_node(struct fib6_walker *w) { int res; struct rt6_info *rt; @@ -287,7 +278,7 @@ static int fib6_dump_node(struct fib6_walker_t *w) static void fib6_dump_end(struct netlink_callback *cb) { - struct fib6_walker_t *w = (void *)cb->args[2]; + struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { @@ -310,7 +301,7 @@ static int fib6_dump_done(struct netlink_callback *cb) static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { - struct fib6_walker_t *w; + struct fib6_walker *w; int res; w = (void *)cb->args[2]; @@ -355,7 +346,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) unsigned int h, s_h; unsigned int e = 0, s_e; struct rt6_rtnl_dump_arg arg; - struct fib6_walker_t *w; + struct fib6_walker *w; struct fib6_table *tb; struct hlist_head *head; int res = 0; @@ -627,7 +618,7 @@ insert_above: return ln; } -static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt) +static bool rt6_qualify_for_ecmp(struct rt6_info *rt) { return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; @@ -820,7 +811,7 @@ add: return 0; } -static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt) +static void fib6_start_gc(struct net *net, struct rt6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) @@ -1174,7 +1165,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, int children; int nstate; struct fib6_node *child, *pn; - struct fib6_walker_t *w; + struct fib6_walker *w; int iter = 0; for (;;) { @@ -1276,7 +1267,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, struct nl_info *info) { - struct fib6_walker_t *w; + struct fib6_walker *w; struct rt6_info *rt = *rtp; struct net *net = info->nl_net; @@ -1414,7 +1405,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * <0 -> walk is terminated by an error. */ -static int fib6_walk_continue(struct fib6_walker_t *w) +static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn; @@ -1498,7 +1489,7 @@ skip: } } -static int fib6_walk(struct fib6_walker_t *w) +static int fib6_walk(struct fib6_walker *w) { int res; @@ -1512,11 +1503,11 @@ static int fib6_walk(struct fib6_walker_t *w) return res; } -static int fib6_clean_node(struct fib6_walker_t *w) +static int fib6_clean_node(struct fib6_walker *w) { int res; struct rt6_info *rt; - struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w); + struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, }; @@ -1554,9 +1545,9 @@ static int fib6_clean_node(struct fib6_walker_t *w) static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), - int prune, void *arg) + bool prune, void *arg) { - struct fib6_cleaner_t c; + struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; @@ -1583,7 +1574,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, 0, arg); + func, false, arg); write_unlock_bh(&table->tb6_lock); } } @@ -1602,7 +1593,7 @@ static int fib6_prune_clone(struct rt6_info *rt, void *arg) static void fib6_prune_clones(struct net *net, struct fib6_node *fn) { - fib6_clean_tree(net, fn, fib6_prune_clone, 1, NULL); + fib6_clean_tree(net, fn, fib6_prune_clone, true, NULL); } static int fib6_update_sernum(struct rt6_info *rt, void *arg) @@ -1828,7 +1819,7 @@ void fib6_gc_cleanup(void) struct ipv6_route_iter { struct seq_net_private p; - struct fib6_walker_t w; + struct fib6_walker w; loff_t skip; struct fib6_table *tbl; __u32 sernum; @@ -1859,7 +1850,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) return 0; } -static int ipv6_route_yield(struct fib6_walker_t *w) +static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; @@ -1980,7 +1971,7 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { - struct fib6_walker_t *w = &iter->w; + struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } -- cgit v1.2.3 From 42b18706469a02c1f84375ac0ee2f30f28d85d4c Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 6 Oct 2014 19:58:35 +0200 Subject: ipv6: make rt_sernum atomic and serial number fields ordinary ints Cc: YOSHIFUJI Hideaki Cc: Martin Lau Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 +- net/ipv6/ip6_fib.c | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 9221bf4c64f7..8eea35d32a75 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -64,7 +64,7 @@ struct fib6_node { __u16 fn_bit; /* bit key */ __u16 fn_flags; - __u32 fn_sernum; + int fn_sernum; struct rt6_info *rr_ptr; }; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e8d7465b1597..332f1e0ff8e2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -74,7 +74,7 @@ static int fib6_walk_continue(struct fib6_walker *w); * result of redirects, path MTU changes, etc. */ -static __u32 rt_sernum; +static atomic_t rt_sernum = ATOMIC_INIT(1); static void fib6_gc_timer_cb(unsigned long arg); @@ -95,12 +95,15 @@ static void fib6_walker_unlink(struct fib6_walker *w) write_unlock_bh(&fib6_walker_lock); } -static u32 fib6_new_sernum(void) +static int fib6_new_sernum(void) { - u32 n = ++rt_sernum; - if ((__s32)n <= 0) - rt_sernum = n = 1; - return n; + int new, old; + + do { + old = atomic_read(&rt_sernum); + new = old < INT_MAX ? old + 1 : 1; + } while (atomic_cmpxchg(&rt_sernum, old, new) != old); + return new; } /* @@ -421,7 +424,7 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, struct rt6key *key; int bit; __be32 dir = 0; - __u32 sernum = fib6_new_sernum(); + int sernum = fib6_new_sernum(); RT6_TRACE("fib6_add_1\n"); @@ -1598,7 +1601,7 @@ static void fib6_prune_clones(struct net *net, struct fib6_node *fn) static int fib6_update_sernum(struct rt6_info *rt, void *arg) { - __u32 sernum = *(__u32 *)arg; + int sernum = *(int *)arg; if (rt->rt6i_node && rt->rt6i_node->fn_sernum != sernum) @@ -1609,7 +1612,7 @@ static int fib6_update_sernum(struct rt6_info *rt, void *arg) static void fib6_flush_trees(struct net *net) { - __u32 new_sernum = fib6_new_sernum(); + int new_sernum = fib6_new_sernum(); fib6_clean_all(net, fib6_update_sernum, &new_sernum); } @@ -1822,7 +1825,7 @@ struct ipv6_route_iter { struct fib6_walker w; loff_t skip; struct fib6_table *tbl; - __u32 sernum; + int sernum; }; static int ipv6_route_seq_show(struct seq_file *seq, void *v) -- cgit v1.2.3 From c8c4d42a6b46ea9f81816c47ac8d6ae25cf533fc Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 6 Oct 2014 19:58:36 +0200 Subject: ipv6: only generate one new serial number per fib mutation Cc: YOSHIFUJI Hideaki Cc: Martin Lau Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 332f1e0ff8e2..be9cb09b05f7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -417,14 +417,13 @@ out: static struct fib6_node *fib6_add_1(struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, - int replace_required) + int replace_required, int sernum) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; - int sernum = fib6_new_sernum(); RT6_TRACE("fib6_add_1\n"); @@ -842,6 +841,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, int err = -ENOMEM; int allow_create = 1; int replace_required = 0; + int sernum = fib6_new_sernum(); if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -854,7 +854,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, - replace_required); + replace_required, sernum); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; @@ -888,14 +888,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); sfn->fn_flags = RTN_ROOT; - sfn->fn_sernum = fib6_new_sernum(); + sfn->fn_sernum = sernum; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required); + allow_create, replace_required, sernum); if (IS_ERR(sn)) { /* If it is failed, discard just allocated @@ -914,7 +914,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required); + allow_create, replace_required, sernum); if (IS_ERR(sn)) { err = PTR_ERR(sn); -- cgit v1.2.3 From 812918c464eca0e8c145f975932ca5020e9c05cb Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 6 Oct 2014 19:58:37 +0200 Subject: ipv6: make fib6 serial number per namespace Try to reduce number of possible fn_sernum mutation by constraining them to their namespace. Also remove rt_genid which I forgot to remove in 705f1c869d577c ("ipv6: remove rt6i_genid"). Cc: YOSHIFUJI Hideaki Cc: Martin Lau Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipv6/ip6_fib.c | 13 ++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index eade27adecf3..69ae41f2098c 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -76,7 +76,7 @@ struct netns_ipv6 { #endif #endif atomic_t dev_addr_genid; - atomic_t rt_genid; + atomic_t fib6_sernum; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 34f726f59814..e8c4400f23e9 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -766,7 +766,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = 0; - atomic_set(&net->ipv6.rt_genid, 0); + atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); if (err) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index be9cb09b05f7..6f9beb1f2861 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -74,8 +74,6 @@ static int fib6_walk_continue(struct fib6_walker *w); * result of redirects, path MTU changes, etc. */ -static atomic_t rt_sernum = ATOMIC_INIT(1); - static void fib6_gc_timer_cb(unsigned long arg); static LIST_HEAD(fib6_walkers); @@ -95,14 +93,15 @@ static void fib6_walker_unlink(struct fib6_walker *w) write_unlock_bh(&fib6_walker_lock); } -static int fib6_new_sernum(void) +static int fib6_new_sernum(struct net *net) { int new, old; do { - old = atomic_read(&rt_sernum); + old = atomic_read(&net->ipv6.fib6_sernum); new = old < INT_MAX ? old + 1 : 1; - } while (atomic_cmpxchg(&rt_sernum, old, new) != old); + } while (atomic_cmpxchg(&net->ipv6.fib6_sernum, + old, new) != old); return new; } @@ -841,7 +840,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, int err = -ENOMEM; int allow_create = 1; int replace_required = 0; - int sernum = fib6_new_sernum(); + int sernum = fib6_new_sernum(info->nl_net); if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1612,7 +1611,7 @@ static int fib6_update_sernum(struct rt6_info *rt, void *arg) static void fib6_flush_trees(struct net *net) { - int new_sernum = fib6_new_sernum(); + int new_sernum = fib6_new_sernum(net); fib6_clean_all(net, fib6_update_sernum, &new_sernum); } -- cgit v1.2.3 From 327571cb100cad587c9eda351e7a2d182466873b Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 6 Oct 2014 19:58:38 +0200 Subject: ipv6: don't walk node's leaf during serial number update Cc: YOSHIFUJI Hideaki Cc: Martin Lau Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6f9beb1f2861..b2d1838897c9 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -50,6 +50,7 @@ struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct rt6_info *, void *arg); + int sernum; void *arg; }; @@ -105,6 +106,10 @@ static int fib6_new_sernum(struct net *net) return new; } +enum { + FIB6_NO_SERNUM_CHANGE = 0, +}; + /* * Auxiliary address test functions for the radix tree. * @@ -1514,6 +1519,16 @@ static int fib6_clean_node(struct fib6_walker *w) .nl_net = c->net, }; + if (c->sernum != FIB6_NO_SERNUM_CHANGE && + w->node->fn_sernum != c->sernum) + w->node->fn_sernum = c->sernum; + + if (!c->func) { + WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); + w->leaf = NULL; + return 0; + } + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { res = c->func(rt, c->arg); if (res < 0) { @@ -1547,7 +1562,7 @@ static int fib6_clean_node(struct fib6_walker *w) static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), - bool prune, void *arg) + bool prune, int sernum, void *arg) { struct fib6_cleaner c; @@ -1557,14 +1572,16 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.w.count = 0; c.w.skip = 0; c.func = func; + c.sernum = sernum; c.arg = arg; c.net = net; fib6_walk(&c.w); } -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), - void *arg) +static void __fib6_clean_all(struct net *net, + int (*func)(struct rt6_info *, void *), + int sernum, void *arg) { struct fib6_table *table; struct hlist_head *head; @@ -1576,13 +1593,19 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, false, arg); + func, false, sernum, arg); write_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } +void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), + void *arg) +{ + __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); +} + static int fib6_prune_clone(struct rt6_info *rt, void *arg) { if (rt->rt6i_flags & RTF_CACHE) { @@ -1595,25 +1618,15 @@ static int fib6_prune_clone(struct rt6_info *rt, void *arg) static void fib6_prune_clones(struct net *net, struct fib6_node *fn) { - fib6_clean_tree(net, fn, fib6_prune_clone, true, NULL); -} - -static int fib6_update_sernum(struct rt6_info *rt, void *arg) -{ - int sernum = *(int *)arg; - - if (rt->rt6i_node && - rt->rt6i_node->fn_sernum != sernum) - rt->rt6i_node->fn_sernum = sernum; - - return 0; + fib6_clean_tree(net, fn, fib6_prune_clone, true, + FIB6_NO_SERNUM_CHANGE, NULL); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); - fib6_clean_all(net, fib6_update_sernum, &new_sernum); + __fib6_clean_all(net, NULL, new_sernum, NULL); } /* -- cgit v1.2.3 From 42350dcaaf1d8d95d58e8b43aee006d62c84bc2e Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Mon, 6 Oct 2014 13:22:50 -0700 Subject: net: fix a sparse warning Fix a sparse warning introduced by Commit 0b5e8b8eeae40bae6ad7c7e91c97c3c0d0e57882 (net: Add Geneve tunneling protocol driver) caught by kbuild test robot: # apt-get install sparse # git checkout 0b5e8b8eeae40bae6ad7c7e91c97c3c0d0e57882 # make ARCH=x86_64 allmodconfig # make C=1 CF=-D__CHECK_ENDIAN__ # # # sparse warnings: (new ones prefixed by >>) # # >> net/ipv4/geneve.c:230:42: sparse: incorrect type in assignment (different base types) # net/ipv4/geneve.c:230:42: expected restricted __be32 [addressable] [assigned] [usertype] s_addr # net/ipv4/geneve.c:230:42: got unsigned long [unsigned] # Reported-by: kbuild test robot Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- net/ipv4/geneve.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index f008c5515f48..065cd94c640c 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -227,7 +227,7 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6, udp_conf.family = AF_INET6; } else { udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = INADDR_ANY; + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); } udp_conf.local_udp_port = port; -- cgit v1.2.3 From 0a5d1c55faa5414858857875496f6f6a9926fa51 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Mon, 6 Oct 2014 13:22:51 -0700 Subject: openvswitch: fix a sparse warning Fix a sparse warning introduced by commit: f5796684069e0c71c65bce6a6d4766114aec1396 (openvswitch: Add support for Geneve tunneling.) caught by kbuild test robot: reproduce: # apt-get install sparse # git checkout f5796684069e0c71c65bce6a6d4766114aec1396 # make ARCH=x86_64 allmodconfig # make C=1 CF=-D__CHECK_ENDIAN__ # # # sparse warnings: (new ones prefixed by >>) # # >> net/openvswitch/vport-geneve.c:109:15: sparse: incorrect type in assignment (different base types) # net/openvswitch/vport-geneve.c:109:15: expected restricted __be16 [usertype] sport # net/openvswitch/vport-geneve.c:109:15: got int # >> net/openvswitch/vport-geneve.c:110:56: sparse: incorrect type in argument 3 (different base types) # net/openvswitch/vport-geneve.c:110:56: expected unsigned short [unsigned] [usertype] value # net/openvswitch/vport-geneve.c:110:56: got restricted __be16 [usertype] sport Reported-by: kbuild test robot Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- net/openvswitch/vport-geneve.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 5572d482f285..910b3ef2c0d5 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -104,10 +104,9 @@ static int geneve_get_options(const struct vport *vport, struct sk_buff *skb) { struct geneve_port *geneve_port = geneve_vport(vport); - __be16 sport; + struct inet_sock *sk = inet_sk(geneve_port->gs->sock->sk); - sport = ntohs(inet_sk(geneve_port->gs->sock->sk)->inet_sport); - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, sport)) + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport))) return -EMSGSIZE; return 0; } -- cgit v1.2.3 From 7c5df8fa1921450d2210db9928f43cf4f414982c Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Mon, 6 Oct 2014 15:15:14 -0700 Subject: openvswitch: fix a compilation error when CONFIG_INET is not setW! Fix a openvswitch compilation error when CONFIG_INET is not set: ===================================================== In file included from include/net/geneve.h:4:0, from net/openvswitch/flow_netlink.c:45: include/net/udp_tunnel.h: In function 'udp_tunnel_handle_offloads': >> include/net/udp_tunnel.h:100:2: error: implicit declaration of function 'iptunnel_handle_offloads' [-Werror=implicit-function-declaration] >> return iptunnel_handle_offloads(skb, udp_csum, type); >> ^ >> >> include/net/udp_tunnel.h:100:2: warning: return makes pointer from integer without a cast >> >> cc1: some warnings being treated as errors ===================================================== Reported-by: kbuild test robot Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- drivers/net/Kconfig | 1 - include/net/geneve.h | 36 +++++++++++++++++++++--------------- net/ipv4/Kconfig | 29 +++++++++++++++-------------- 3 files changed, 36 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index c6f6f69f8961..4706386b7d34 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -147,7 +147,6 @@ config MACVTAP config VXLAN tristate "Virtual eXtensible Local Area Network (VXLAN)" depends on INET - select NET_IP_TUNNEL select NET_UDP_TUNNEL ---help--- This allows one to create vxlan virtual interfaces that provide diff --git a/include/net/geneve.h b/include/net/geneve.h index ce98865318bf..112132cf8e2e 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -1,22 +1,10 @@ #ifndef __NET_GENEVE_H #define __NET_GENEVE_H 1 +#ifdef CONFIG_INET #include +#endif -struct geneve_sock; - -typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb); - -struct geneve_sock { - struct hlist_node hlist; - geneve_rcv_t *rcv; - void *rcv_data; - struct work_struct del_work; - struct socket *sock; - struct rcu_head rcu; - atomic_t refcnt; - struct udp_offload udp_offloads; -}; /* Geneve Header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ @@ -74,6 +62,22 @@ struct genevehdr { struct geneve_opt options[]; }; +#ifdef CONFIG_INET +struct geneve_sock; + +typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb); + +struct geneve_sock { + struct hlist_node hlist; + geneve_rcv_t *rcv; + void *rcv_data; + struct work_struct del_work; + struct socket *sock; + struct rcu_head rcu; + atomic_t refcnt; + struct udp_offload udp_offloads; +}; + #define GENEVE_VER 0 #define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) @@ -88,4 +92,6 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, bool xnet); -#endif +#endif /*ifdef CONFIG_INET */ + +#endif /*ifdef__NET_GENEVE_H */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index c2035447e649..e682b48e0709 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -309,6 +309,7 @@ config NET_IPVTI config NET_UDP_TUNNEL tristate + select NET_IP_TUNNEL default n config NET_FOU @@ -321,6 +322,20 @@ config NET_FOU network mechanisms and optimizations for UDP (such as ECMP and RSS) can be leveraged to provide better service. +config GENEVE + tristate "Generic Network Virtualization Encapsulation (Geneve)" + depends on INET + select NET_UDP_TUNNEL + ---help--- + This allows one to create Geneve virtual interfaces that provide + Layer 2 Networks over Layer 3 Networks. Geneve is often used + to tunnel virtual network infrastructure in virtualized environments. + For more information see: + http://tools.ietf.org/html/draft-gross-geneve-01 + + To compile this driver as a module, choose M here: the module + + config INET_AH tristate "IP: AH transformation" select XFRM_ALGO @@ -453,20 +468,6 @@ config TCP_CONG_BIC increase provides TCP friendliness. See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ -config GENEVE - tristate "Generic Network Virtualization Encapsulation (Geneve)" - depends on INET - select NET_IP_TUNNEL - select NET_UDP_TUNNEL - ---help--- - This allows one to create Geneve virtual interfaces that provide - Layer 2 Networks over Layer 3 Networks. Geneve is often used - to tunnel virtual network infrastructure in virtualized environments. - For more information see: - http://tools.ietf.org/html/draft-gross-geneve-01 - - To compile this driver as a module, choose M here: the module - config TCP_CONG_CUBIC tristate "CUBIC TCP" default y -- cgit v1.2.3 From 02c0fc1b8f41e6e895d6573615ba8ff549b685d2 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 6 Oct 2014 17:01:33 -0700 Subject: net_sched: fix unused variables in __gnet_stats_copy_basic_cpu() Probably not a big deal, but we'd better just use the one we get in retry loop. Fixes: commit 22e0f8b9322cb1a48b1357e8 ("net: sched: make bstats per cpu and estimator RCU safe") Reported-by: Joe Perches Cc: John Fastabend Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/gen_stats.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 14681b97a4f3..0c08062d1796 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -106,8 +106,8 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, for_each_possible_cpu(i) { struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); unsigned int start; - __u64 bytes; - __u32 packets; + u64 bytes; + u32 packets; do { start = u64_stats_fetch_begin_irq(&bcpu->syncp); @@ -115,8 +115,8 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, packets = bcpu->bstats.packets; } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); - bstats->bytes += bcpu->bstats.bytes; - bstats->packets += bcpu->bstats.packets; + bstats->bytes += bytes; + bstats->packets += packets; } } -- cgit v1.2.3 From 0287587884b15041203b3a362d485e1ab1f24445 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 18:38:35 -0700 Subject: net: better IFF_XMIT_DST_RELEASE support Testing xmit_more support with netperf and connected UDP sockets, I found strange dst refcount false sharing. Current handling of IFF_XMIT_DST_RELEASE is not optimal. Dropping dst in validate_xmit_skb() is certainly too late in case packet was queued by cpu X but dequeued by cpu Y The logical point to take care of drop/force is in __dev_queue_xmit() before even taking qdisc lock. As Julian Anastasov pointed out, need for skb_dst() might come from some packet schedulers or classifiers. This patch adds new helper to cleanly express needs of various drivers or qdiscs/classifiers. Drivers that need skb_dst() in their ndo_start_xmit() should call following helper in their setup instead of the prior : dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; -> netif_keep_dst(dev); Instead of using a single bit, we use two bits, one being eventually rebuilt in bonding/team drivers. The other one, is permanent and blocks IFF_XMIT_DST_RELEASE being rebuilt in bonding/team. Eventually, we could add something smarter later. Signed-off-by: Eric Dumazet Cc: Julian Anastasov Signed-off-by: David S. Miller --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- drivers/net/appletalk/ipddp.c | 2 +- drivers/net/bonding/bond_main.c | 9 ++++++--- drivers/net/eql.c | 2 +- drivers/net/ifb.c | 3 ++- drivers/net/loopback.c | 2 +- drivers/net/macvlan.c | 3 ++- drivers/net/ppp/ppp_generic.c | 2 +- drivers/net/team/team.c | 8 +++++--- drivers/net/vxlan.c | 2 +- drivers/net/wan/hdlc_fr.c | 2 +- drivers/s390/net/qeth_l3_main.c | 2 +- include/linux/netdevice.h | 8 ++++++++ net/8021q/vlan_dev.c | 3 ++- net/atm/clip.c | 2 +- net/core/dev.c | 19 +++++++++---------- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_vti.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv6/ip6_gre.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6_vti.c | 2 +- net/ipv6/sit.c | 2 +- net/sched/cls_flow.c | 2 ++ net/sched/cls_route.c | 1 + net/sched/sch_generic.c | 3 --- net/sched/sch_teql.c | 2 +- 27 files changed, 54 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 13e6e0431592..58b5aa3b6f2d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1364,7 +1364,7 @@ void ipoib_setup(struct net_device *dev) dev->tx_queue_len = ipoib_sendq_size * 2; dev->features = (NETIF_F_VLAN_CHALLENGED | NETIF_F_HIGHDMA); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index 10d0dba572c2..e90c6a7333d7 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -74,7 +74,7 @@ static struct net_device * __init ipddp_init(void) if (!dev) return ERR_PTR(-ENOMEM); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); strcpy(dev->name, "ipddp%d"); if (version_printed++ == 0) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 3ad5413d4f57..c9ac06cfe6b7 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1002,7 +1002,8 @@ static netdev_features_t bond_fix_features(struct net_device *dev, static void bond_compute_features(struct bonding *bond) { - unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE; + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | + IFF_XMIT_DST_RELEASE_PERM; netdev_features_t vlan_features = BOND_VLAN_FEATURES; netdev_features_t enc_features = BOND_ENC_FEATURES; struct net_device *bond_dev = bond->dev; @@ -1038,8 +1039,10 @@ done: bond_dev->gso_max_segs = gso_max_segs; netif_set_gso_max_size(bond_dev, gso_max_size); - flags = bond_dev->priv_flags & ~IFF_XMIT_DST_RELEASE; - bond_dev->priv_flags = flags | dst_release_flag; + bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && + dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(bond_dev); } diff --git a/drivers/net/eql.c b/drivers/net/eql.c index 957e5c0cede3..a10ad74cc8d2 100644 --- a/drivers/net/eql.c +++ b/drivers/net/eql.c @@ -199,7 +199,7 @@ static void __init eql_setup(struct net_device *dev) dev->type = ARPHRD_SLIP; dev->tx_queue_len = 5; /* Hands them off fast */ - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int eql_open(struct net_device *dev) diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index d2d4a3d2237f..34f846b4bd05 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -185,7 +185,8 @@ static void ifb_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->flags &= ~IFF_MULTICAST; - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); eth_hw_addr_random(dev); } diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 8f2262540561..c76283c2f84a 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -169,7 +169,7 @@ static void loopback_setup(struct net_device *dev) dev->type = ARPHRD_LOOPBACK; /* 0x0001*/ dev->flags = IFF_LOOPBACK; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->hw_features = NETIF_F_ALL_TSO | NETIF_F_UFO; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index e8a453f1b458..38b4fae61f04 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1025,7 +1025,8 @@ void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); dev->priv_flags |= IFF_UNICAST_FLT; dev->netdev_ops = &macvlan_netdev_ops; dev->destructor = free_netdev; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index fa0d71727894..80e6f3430f65 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1103,7 +1103,7 @@ static void ppp_setup(struct net_device *dev) dev->type = ARPHRD_PPP; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->features |= NETIF_F_NETNS_LOCAL; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } /* diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 2277c3679a51..a94a9df3e6bd 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -970,7 +970,8 @@ static void __team_compute_features(struct team *team) struct team_port *port; u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL; unsigned short max_hard_header_len = ETH_HLEN; - unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE; + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | + IFF_XMIT_DST_RELEASE_PERM; list_for_each_entry(port, &team->port_list, list) { vlan_features = netdev_increment_features(vlan_features, @@ -985,8 +986,9 @@ static void __team_compute_features(struct team *team) team->dev->vlan_features = vlan_features; team->dev->hard_header_len = max_hard_header_len; - flags = team->dev->priv_flags & ~IFF_XMIT_DST_RELEASE; - team->dev->priv_flags = flags | dst_release_flag; + team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if (dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + team->dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(team->dev); } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 2af795d6ba05..2a51e6e48e1e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2193,7 +2193,7 @@ static void vxlan_setup(struct net_device *dev) dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; INIT_LIST_HEAD(&vxlan->next); diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index e5c7e6165a4b..3ebed1c40abb 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -1047,7 +1047,7 @@ static void pvc_setup(struct net_device *dev) dev->flags = IFF_POINTOPOINT; dev->hard_header_len = 10; dev->addr_len = 2; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static const struct net_device_ops pvc_ops = { diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index f8427a2c4840..afebb9709763 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3306,7 +3306,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card) card->dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER; - card->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(card->dev); card->dev->gso_max_size = 15 * PAGE_SIZE; SET_NETDEV_DEV(card->dev, &card->gdev->dev); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2df86f50261c..3a4315b39d20 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1206,6 +1206,7 @@ enum netdev_priv_flags { IFF_SUPP_NOFCS = 1<<19, IFF_LIVE_ADDR_CHANGE = 1<<20, IFF_MACVLAN = 1<<21, + IFF_XMIT_DST_RELEASE_PERM = 1<<22, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1230,6 +1231,7 @@ enum netdev_priv_flags { #define IFF_SUPP_NOFCS IFF_SUPP_NOFCS #define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE #define IFF_MACVLAN IFF_MACVLAN +#define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM /** * struct net_device - The DEVICE structure. @@ -3588,6 +3590,12 @@ static inline bool netif_supports_nofcs(struct net_device *dev) return dev->priv_flags & IFF_SUPP_NOFCS; } +/* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ +static inline void netif_keep_dst(struct net_device *dev) +{ + dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM); +} + extern struct pernet_operations __net_initdata loopback_net_ops; /* Logging, debugging and troubleshooting/diagnostic helpers. */ diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 35a6b6b15e8a..0d441ec8763e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -799,7 +799,8 @@ void vlan_setup(struct net_device *dev) ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN; - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; diff --git a/net/atm/clip.c b/net/atm/clip.c index 1d9eaa4f041a..17e55dfecbe2 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -501,7 +501,7 @@ static void clip_setup(struct net_device *dev) /* without any more elaborate queuing. 100 is a reasonable */ /* compromise between decent burst-tolerance and protection */ /* against memory hogs. */ - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int clip_create(int number) diff --git a/net/core/dev.c b/net/core/dev.c index a63b8c43c1b6..3c5bdaa44486 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2665,12 +2665,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (skb->next) return skb; - /* If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); - features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) @@ -2811,8 +2805,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) - skb_dst_force(skb); qdisc_bstats_update(q, skb); @@ -2827,7 +2819,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { - skb_dst_force(skb); rc = q->enqueue(skb, q) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { @@ -2924,6 +2915,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_update_prio(skb); + /* If device/qdisc don't need skb->dst, release it right now while + * its hot in this cpu cache. + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + else + skb_dst_force(skb); + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -6674,7 +6673,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->all_adj_list.upper); INIT_LIST_HEAD(&dev->all_adj_list.lower); - dev->priv_flags = IFF_XMIT_DST_RELEASE; + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); dev->num_tx_queues = txqs; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0485ef18d254..12055fdbe716 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -510,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->addr_len = 4; if (iph->daddr) { diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index e453cb724a95..3e861011e4a3 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ea88ab3102a8..37096d64730e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -289,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 74b677916a70..de3b1c86b8d3 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1242,7 +1242,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->iflink = 0; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int ip6gre_tunnel_init(struct net_device *dev) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index d3e8888ad611..9409887fb664 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1493,7 +1493,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->mtu -= 8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5833a2244467..d440bb585524 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -807,7 +807,7 @@ static void vti6_dev_setup(struct net_device *dev) dev->mtu = ETH_DATA_LEN; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } /** diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 0d4e27466f82..6eab37cf5345 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1364,7 +1364,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index a5d2b20db560..4ac515f2a6ce 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -493,6 +493,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, tcf_exts_change(tp, &fnew->exts, &e); tcf_em_tree_change(tp, &fnew->ematches, &t); + netif_keep_dst(qdisc_dev(tp->q)); + if (tb[TCA_FLOW_KEYS]) { fnew->keymask = keymask; fnew->nkeys = nkeys; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 6f22baae0afa..109a329b7198 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -524,6 +524,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (f->handle < f1->handle) break; + netif_keep_dst(qdisc_dev(tp->q)); rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2b349a4de3c8..38d58e6cef07 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -47,7 +47,6 @@ EXPORT_SYMBOL(default_qdisc_ops); static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - skb_dst_force(skb); q->gso_skb = skb; q->qstats.requeues++; q->q.qlen++; /* it's still part of the queue */ @@ -218,8 +217,6 @@ static inline int qdisc_restart(struct Qdisc *q) if (unlikely(!skb)) return 0; - WARN_ON_ONCE(skb_dst_is_noref(skb)); - root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 5cd291bd00e4..6ada42396a24 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -470,7 +470,7 @@ static __init void teql_master_setup(struct net_device *dev) dev->tx_queue_len = 100; dev->flags = IFF_NOARP; dev->hard_header_len = LL_MAX_HEADER; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static LIST_HEAD(master_dev_list); -- cgit v1.2.3 From 908344cdda8039dd5c291e8a1ddd49649dff8c4b Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Tue, 7 Oct 2014 14:12:34 -0400 Subject: tipc: fix bug in multicast congestion handling One aim of commit 50100a5e39461b2a61d6040e73c384766c29975d ("tipc: use pseudo message to wake up sockets after link congestion") was to handle link congestion abatement in a uniform way for both unicast and multicast transmit. However, the latter doesn't work correctly, and has been broken since the referenced commit was applied. If a user now sends a burst of multicast messages that is big enough to cause broadcast link congestion, it will be put to sleep, and not be waked up when the congestion abates as it should be. This has two reasons. First, the flag that is used, TIPC_WAKEUP_USERS, is set correctly, but in the wrong field. Instead of setting it in the 'action_flags' field of the arrival node struct, it is by mistake set in the dummy node struct that is owned by the broadcast link, where it will never tested for. Second, we cannot use the same flag for waking up unicast and multicast users, since the function tipc_node_unlock() needs to pick the wakeup pseudo messages to deliver from different queues. It must hence be able to distinguish between the two cases. This commit solves this problem by adding a new flag TIPC_WAKEUP_BCAST_USERS, and a new function tipc_bclink_wakeup_user(). The latter is to be called by tipc_node_unlock() when the named flag, now set in the correct field, is encountered. v2: using explicit 'unsigned int' declaration instead of 'uint', as per comment from David Miller. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 14 +++++++++++++- net/tipc/bcast.h | 2 +- net/tipc/node.c | 5 +++++ net/tipc/node.h | 3 ++- 4 files changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index b2bbe69b2554..b8670bf262e2 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -225,6 +225,17 @@ static void bclink_retransmit_pkt(u32 after, u32 to) tipc_link_retransmit(bcl, buf, mod(to - after)); } +/** + * tipc_bclink_wakeup_users - wake up pending users + * + * Called with no locks taken + */ +void tipc_bclink_wakeup_users(void) +{ + while (skb_queue_len(&bclink->link.waiting_sks)) + tipc_sk_rcv(skb_dequeue(&bclink->link.waiting_sks)); +} + /** * tipc_bclink_acknowledge - handle acknowledgement of broadcast packets * @n_ptr: node that sent acknowledgement info @@ -300,7 +311,8 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) bclink_set_last_sent(); } if (unlikely(released && !skb_queue_empty(&bcl->waiting_sks))) - bclink->node.action_flags |= TIPC_WAKEUP_USERS; + n_ptr->action_flags |= TIPC_WAKEUP_BCAST_USERS; + exit: tipc_bclink_unlock(); } diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 4875d9536aee..e7b0f85a82bc 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -99,5 +99,5 @@ int tipc_bclink_set_queue_limits(u32 limit); void tipc_bcbearer_sort(struct tipc_node_map *nm_ptr, u32 node, bool action); uint tipc_bclink_get_mtu(void); int tipc_bclink_xmit(struct sk_buff *buf); - +void tipc_bclink_wakeup_users(void); #endif diff --git a/net/tipc/node.c b/net/tipc/node.c index 17e6378c4dfe..90cee4a6fce4 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -552,6 +552,7 @@ void tipc_node_unlock(struct tipc_node *node) LIST_HEAD(conn_sks); struct sk_buff_head waiting_sks; u32 addr = 0; + unsigned int flags = node->action_flags; if (likely(!node->action_flags)) { spin_unlock_bh(&node->lock); @@ -572,6 +573,7 @@ void tipc_node_unlock(struct tipc_node *node) node->action_flags &= ~TIPC_NOTIFY_NODE_UP; addr = node->addr; } + node->action_flags &= ~TIPC_WAKEUP_BCAST_USERS; spin_unlock_bh(&node->lock); while (!skb_queue_empty(&waiting_sks)) @@ -583,6 +585,9 @@ void tipc_node_unlock(struct tipc_node *node) if (!list_empty(&nsub_list)) tipc_nodesub_notify(&nsub_list); + if (flags & TIPC_WAKEUP_BCAST_USERS) + tipc_bclink_wakeup_users(); + if (addr) tipc_named_node_up(addr); } diff --git a/net/tipc/node.h b/net/tipc/node.h index 522d6f3157b3..67513c3c852c 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -59,7 +59,8 @@ enum { TIPC_WAIT_OWN_LINKS_DOWN = (1 << 2), TIPC_NOTIFY_NODE_DOWN = (1 << 3), TIPC_NOTIFY_NODE_UP = (1 << 4), - TIPC_WAKEUP_USERS = (1 << 5) + TIPC_WAKEUP_USERS = (1 << 5), + TIPC_WAKEUP_BCAST_USERS = (1 << 6) }; /** -- cgit v1.2.3 From 93fdd47e52f3f869a437319db9da1ea409acc07e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 5 Oct 2014 12:00:22 +0800 Subject: bridge: Save frag_max_size between PRE_ROUTING and POST_ROUTING As we may defragment the packet in IPv4 PRE_ROUTING and refragment it after POST_ROUTING we should save the value of frag_max_size. This is still very wrong as the bridge is supposed to leave the packets intact, meaning that the right thing to do is to use the original frag_list for fragmentation. Unfortunately we don't currently guarantee that the frag_list is left untouched throughout netfilter so until this changes this is the best we can do. There is also a spot in FORWARD where it appears that we can forward a packet without going through fragmentation, mark it so that we can fix it later. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 11 +++++++++++ net/bridge/br_private.h | 4 ++++ 2 files changed, 15 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index a615264cf01a..4063898cf8aa 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -404,6 +404,7 @@ static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) ETH_HLEN-ETH_ALEN); /* tell br_dev_xmit to continue with forwarding */ nf_bridge->mask |= BRNF_BRIDGED_DNAT; + /* FIXME Need to refragment */ ret = neigh->output(neigh, skb); } neigh_release(neigh); @@ -459,6 +460,10 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb) struct nf_bridge_info *nf_bridge = skb->nf_bridge; struct rtable *rt; int err; + int frag_max_size; + + frag_max_size = IPCB(skb)->frag_max_size; + BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size; if (nf_bridge->mask & BRNF_PKT_TYPE) { skb->pkt_type = PACKET_OTHERHOST; @@ -863,13 +868,19 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, static int br_nf_dev_queue_xmit(struct sk_buff *skb) { int ret; + int frag_max_size; + /* This is wrong! We should preserve the original fragment + * boundaries by preserving frag_list rather than refragmenting. + */ if (skb->protocol == htons(ETH_P_IP) && skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu && !skb_is_gso(skb)) { + frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; if (br_parse_ip_options(skb)) /* Drop invalid packet */ return NF_DROP; + IPCB(skb)->frag_max_size = frag_max_size; ret = ip_fragment(skb, br_dev_queue_push_xmit); } else ret = br_dev_queue_push_xmit(skb); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b6c04cbcfdc5..2398369c6dda 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -305,10 +305,14 @@ struct net_bridge struct br_input_skb_cb { struct net_device *brdev; + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING int igmp; int mrouters_only; #endif + + u16 frag_max_size; + #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool vlan_filtered; #endif -- cgit v1.2.3 From ea85a0a2dc7263de733b7d1d13a433d35fcf36f7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 7 Oct 2014 16:33:53 -0400 Subject: ipv6: Do not warn for informational ICMP messages, regardless of type. There is no reason to emit a log message for these. Based upon a suggestion from Hannes Frederic Sowa. Signed-off-by: David S. Miller Acked-by: Hannes Frederic Sowa --- net/ipv6/icmp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 141e1f3ab74e..97ae70077a4f 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -777,12 +777,12 @@ static int icmpv6_rcv(struct sk_buff *skb) break; default: - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n"); - /* informational */ if (type & ICMPV6_INFOMSG_MASK) break; + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n"); + /* * error of unknown type. * must pass to upper level -- cgit v1.2.3 From 505e907db388185649d93925c9975d0a0704ea64 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 7 Oct 2014 23:02:15 +0200 Subject: af_unix: remove 0 assignment on static static values are automatically initialized to 0 Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/unix/garbage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 9bc73f87f64a..99f7012b23b9 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -258,7 +258,7 @@ static void inc_inflight_move_tail(struct unix_sock *u) list_move_tail(&u->link, &gc_candidates); } -static bool gc_in_progress = false; +static bool gc_in_progress; #define UNIX_INFLIGHT_TRIGGER_GC 16000 void wait_for_unix_gc(void) -- cgit v1.2.3 From 28b7deae75642c51f097391765fd39ff0dd6ce95 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 7 Oct 2014 22:12:03 +0200 Subject: wimax: convert printk to pr_foo() Use current logging functions and add module name prefix. Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/wimax/id-table.c | 2 +- net/wimax/op-msg.c | 9 ++++----- net/wimax/op-reset.c | 3 +-- net/wimax/op-rfkill.c | 3 +-- net/wimax/op-state-get.c | 3 +-- net/wimax/stack.c | 7 +++---- net/wimax/wimax-internal.h | 6 ++++++ 7 files changed, 17 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/wimax/id-table.c b/net/wimax/id-table.c index 72273abfcb16..a21508d11036 100644 --- a/net/wimax/id-table.c +++ b/net/wimax/id-table.c @@ -137,7 +137,7 @@ void wimax_id_table_release(void) #endif spin_lock(&wimax_id_table_lock); list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) { - printk(KERN_ERR "BUG: %s wimax_dev %p ifindex %d not cleared\n", + pr_err("BUG: %s wimax_dev %p ifindex %d not cleared\n", __func__, wimax_dev, wimax_dev->net_dev->ifindex); WARN_ON(1); } diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c index c278b3356f75..54aa146930bd 100644 --- a/net/wimax/op-msg.c +++ b/net/wimax/op-msg.c @@ -189,7 +189,7 @@ const void *wimax_msg_data_len(struct sk_buff *msg, size_t *size) nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), WIMAX_GNL_MSG_DATA); if (nla == NULL) { - printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); return NULL; } *size = nla_len(nla); @@ -211,7 +211,7 @@ const void *wimax_msg_data(struct sk_buff *msg) nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), WIMAX_GNL_MSG_DATA); if (nla == NULL) { - printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); return NULL; } return nla_data(nla); @@ -232,7 +232,7 @@ ssize_t wimax_msg_len(struct sk_buff *msg) nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), WIMAX_GNL_MSG_DATA); if (nla == NULL) { - printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); return -EINVAL; } return nla_len(nla); @@ -343,8 +343,7 @@ int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info) d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); result = -ENODEV; if (info->attrs[WIMAX_GNL_MSG_IFIDX] == NULL) { - printk(KERN_ERR "WIMAX_GNL_MSG_FROM_USER: can't find IFIDX " - "attribute\n"); + pr_err("WIMAX_GNL_MSG_FROM_USER: can't find IFIDX attribute\n"); goto error_no_wimax_dev; } ifindex = nla_get_u32(info->attrs[WIMAX_GNL_MSG_IFIDX]); diff --git a/net/wimax/op-reset.c b/net/wimax/op-reset.c index eb4580784d9d..a42079165e1f 100644 --- a/net/wimax/op-reset.c +++ b/net/wimax/op-reset.c @@ -107,8 +107,7 @@ int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info) d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); result = -ENODEV; if (info->attrs[WIMAX_GNL_RESET_IFIDX] == NULL) { - printk(KERN_ERR "WIMAX_GNL_OP_RFKILL: can't find IFIDX " - "attribute\n"); + pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n"); goto error_no_wimax_dev; } ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RESET_IFIDX]); diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c index 403078d670a9..7d730543f243 100644 --- a/net/wimax/op-rfkill.c +++ b/net/wimax/op-rfkill.c @@ -421,8 +421,7 @@ int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info) d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); result = -ENODEV; if (info->attrs[WIMAX_GNL_RFKILL_IFIDX] == NULL) { - printk(KERN_ERR "WIMAX_GNL_OP_RFKILL: can't find IFIDX " - "attribute\n"); + pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n"); goto error_no_wimax_dev; } ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_IFIDX]); diff --git a/net/wimax/op-state-get.c b/net/wimax/op-state-get.c index 995c08c827b5..e6788d281d0e 100644 --- a/net/wimax/op-state-get.c +++ b/net/wimax/op-state-get.c @@ -49,8 +49,7 @@ int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info) d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); result = -ENODEV; if (info->attrs[WIMAX_GNL_STGET_IFIDX] == NULL) { - printk(KERN_ERR "WIMAX_GNL_OP_STATE_GET: can't find IFIDX " - "attribute\n"); + pr_err("WIMAX_GNL_OP_STATE_GET: can't find IFIDX attribute\n"); goto error_no_wimax_dev; } ifindex = nla_get_u32(info->attrs[WIMAX_GNL_STGET_IFIDX]); diff --git a/net/wimax/stack.c b/net/wimax/stack.c index ec8b577db135..3f816e2971ee 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -191,8 +191,8 @@ void __check_new_state(enum wimax_st old_state, enum wimax_st new_state, unsigned int allowed_states_bm) { if (WARN_ON(((1 << new_state) & allowed_states_bm) == 0)) { - printk(KERN_ERR "SW BUG! Forbidden state change %u -> %u\n", - old_state, new_state); + pr_err("SW BUG! Forbidden state change %u -> %u\n", + old_state, new_state); } } @@ -602,8 +602,7 @@ int __init wimax_subsys_init(void) wimax_gnl_ops, wimax_gnl_mcgrps); if (unlikely(result < 0)) { - printk(KERN_ERR "cannot register generic netlink family: %d\n", - result); + pr_err("cannot register generic netlink family: %d\n", result); goto error_register_family; } diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h index b445b82020a8..733c4bf8d4b3 100644 --- a/net/wimax/wimax-internal.h +++ b/net/wimax/wimax-internal.h @@ -30,6 +30,12 @@ #define __WIMAX_INTERNAL_H__ #ifdef __KERNEL__ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include -- cgit v1.2.3 From 5301e3e117d88ef0967ce278912e54757f1a31a2 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 6 Oct 2014 17:21:54 -0700 Subject: net_sched: copy exts->type in tcf_exts_change() We need to copy exts->type when committing the change, otherwise it would be always 0. This is a quick fix for -net and -stable, for net-next tcf_exts will be removed. Fixes: commit 33be627159913b094bb578e83 ("net_sched: act: use standard struct list_head") Reported-by: Jamal Hadi Salim Cc: Jamal Hadi Salim Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c28b0d327b12..4f4e08b0e2b7 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -549,6 +549,7 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, tcf_tree_lock(tp); list_splice_init(&dst->actions, &tmp); list_splice(&src->actions, &dst->actions); + dst->type = src->type; tcf_tree_unlock(tp); tcf_action_destroy(&tmp, TCA_ACT_UNBIND); #endif -- cgit v1.2.3 From 16b99a4f6644d58c94acb4b4253e84049de588c5 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 8 Oct 2014 20:37:01 +0200 Subject: netlabel: directly return netlbl_unlabel_genl_init() No need to store netlbl_unlabel_genl_init result and test it before returning. Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/netlabel/netlabel_user.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 1e779bb7fa43..adf8b7900da2 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -71,11 +71,7 @@ int __init netlbl_netlink_init(void) if (ret_val != 0) return ret_val; - ret_val = netlbl_unlabel_genl_init(); - if (ret_val != 0) - return ret_val; - - return 0; + return netlbl_unlabel_genl_init(); } /* -- cgit v1.2.3