diff options
Diffstat (limited to 'net')
124 files changed, 2112 insertions, 1544 deletions
diff --git a/net/6lowpan/Kconfig b/net/6lowpan/Kconfig index 028a5c6d1f61..e4a02ef55102 100644 --- a/net/6lowpan/Kconfig +++ b/net/6lowpan/Kconfig @@ -1,5 +1,5 @@ config 6LOWPAN - bool "6LoWPAN Support" + tristate "6LoWPAN Support" depends on IPV6 ---help--- This enables IPv6 over Low power Wireless Personal Area Network - diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 75d427763992..90cc2bdd4064 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -112,59 +112,6 @@ __be16 vlan_dev_vlan_proto(const struct net_device *dev) } EXPORT_SYMBOL(vlan_dev_vlan_proto); -static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) -{ - if (skb_cow(skb, skb_headroom(skb)) < 0) { - kfree_skb(skb); - return NULL; - } - - memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); - skb->mac_header += VLAN_HLEN; - return skb; -} - -struct sk_buff *vlan_untag(struct sk_buff *skb) -{ - struct vlan_hdr *vhdr; - u16 vlan_tci; - - if (unlikely(vlan_tx_tag_present(skb))) { - /* vlan_tci is already set-up so leave this for another time */ - return skb; - } - - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(!skb)) - goto err_free; - - if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) - goto err_free; - - vhdr = (struct vlan_hdr *) skb->data; - vlan_tci = ntohs(vhdr->h_vlan_TCI); - __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); - - skb_pull_rcsum(skb, VLAN_HLEN); - vlan_set_encap_proto(skb, vhdr); - - skb = vlan_reorder_header(skb); - if (unlikely(!skb)) - goto err_free; - - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - skb_reset_mac_len(skb); - - return skb; - -err_free: - kfree_skb(skb); - return NULL; -} -EXPORT_SYMBOL(vlan_untag); - - /* * vlan info and vid list */ diff --git a/net/atm/lec.c b/net/atm/lec.c index 4c5b8ba0f84f..4b98f897044a 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -410,9 +410,11 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) priv->lane2_ops = NULL; if (priv->lane_version > 1) priv->lane2_ops = &lane2_ops; + rtnl_lock(); if (dev_set_mtu(dev, mesg->content.config.mtu)) pr_info("%s: change_mtu to %d failed\n", dev->name, mesg->content.config.mtu); + rtnl_unlock(); priv->is_proxy = mesg->content.config.is_proxy; break; case l_flush_tran_id: @@ -833,7 +835,6 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl, loff_t *l) { struct hlist_node *e = state->node; - struct lec_arp_table *tmp; if (!e) e = tbl->first; @@ -842,9 +843,7 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl, --*l; } - tmp = container_of(e, struct lec_arp_table, next); - - hlist_for_each_entry_from(tmp, next) { + for (; e; e = e->next) { if (--*l < 0) break; } diff --git a/net/atm/svc.c b/net/atm/svc.c index d8e5d0c2ebbc..1ba23f5018e7 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -50,12 +50,12 @@ static void svc_disconnect(struct atm_vcc *vcc) pr_debug("%p\n", vcc); if (test_bit(ATM_VF_REGIS, &vcc->flags)) { - prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); sigd_enq(vcc, as_close, NULL, NULL, NULL); - while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) { + for (;;) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + if (test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) + break; schedule(); - prepare_to_wait(sk_sleep(sk), &wait, - TASK_UNINTERRUPTIBLE); } finish_wait(sk_sleep(sk), &wait); } @@ -126,11 +126,12 @@ static int svc_bind(struct socket *sock, struct sockaddr *sockaddr, } vcc->local = *addr; set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local); - while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { - schedule(); + for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) + break; + schedule(); } finish_wait(sk_sleep(sk), &wait); clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */ @@ -202,15 +203,14 @@ static int svc_connect(struct socket *sock, struct sockaddr *sockaddr, } vcc->remote = *addr; set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote); if (flags & O_NONBLOCK) { - finish_wait(sk_sleep(sk), &wait); sock->state = SS_CONNECTING; error = -EINPROGRESS; goto out; } error = 0; + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { schedule(); if (!signal_pending(current)) { @@ -297,11 +297,12 @@ static int svc_listen(struct socket *sock, int backlog) goto out; } set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local); - while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { - schedule(); + for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) + break; + schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) { @@ -387,15 +388,15 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags) } /* wait should be short, so we ignore the non-blocking flag */ set_bit(ATM_VF_WAITING, &new_vcc->flags); - prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait, - TASK_UNINTERRUPTIBLE); sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL); - while (test_bit(ATM_VF_WAITING, &new_vcc->flags) && sigd) { + for (;;) { + prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait, + TASK_UNINTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &new_vcc->flags) || !sigd) + break; release_sock(sk); schedule(); lock_sock(sk); - prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait, - TASK_UNINTERRUPTIBLE); } finish_wait(sk_sleep(sk_atm(new_vcc)), &wait); if (!sigd) { @@ -433,12 +434,14 @@ int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) DEFINE_WAIT(wait); set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0); - while (test_bit(ATM_VF_WAITING, &vcc->flags) && - !test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) { - schedule(); + for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &vcc->flags) || + test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) { + break; + } + schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) @@ -529,18 +532,18 @@ static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr, lock_sock(sk); set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); sigd_enq(vcc, as_addparty, NULL, NULL, (struct sockaddr_atmsvc *) sockaddr); if (flags & O_NONBLOCK) { - finish_wait(sk_sleep(sk), &wait); error = -EINPROGRESS; goto out; } pr_debug("added wait queue\n"); - while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { - schedule(); + for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) + break; + schedule(); } finish_wait(sk_sleep(sk), &wait); error = xchg(&sk->sk_err_soft, 0); @@ -558,11 +561,12 @@ static int svc_dropparty(struct socket *sock, int ep_ref) lock_sock(sk); set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref); - while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { - schedule(); + for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) + break; + schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) { diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 022d18ab27a6..fc1835c6bb40 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -188,7 +188,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, /* Reached the end of the list, so insert after 'frag_entry_last'. */ if (likely(frag_entry_last)) { - hlist_add_after(&frag_entry_last->list, &frag_entry_new->list); + hlist_add_behind(&frag_entry_new->list, &frag_entry_last->list); chain->size += skb->len - hdr_size; chain->timestamp = jiffies; ret = true; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 96b66fd30f96..ab6bb2af1d45 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -20,7 +20,6 @@ #include "originator.h" #include "hard-interface.h" #include "translation-table.h" -#include "multicast.h" /** * batadv_mcast_mla_softif_get - get softif multicast listeners diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index b50dabb3f86a..faff6247ac8f 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -589,6 +589,14 @@ EXPORT_SYMBOL(hci_get_route); void hci_le_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; + struct hci_conn_params *params; + + params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, + conn->dst_type); + if (params && params->conn) { + hci_conn_drop(params->conn); + params->conn = NULL; + } conn->state = BT_CLOSED; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index c32d361c0cf7..1d9c29a00568 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2536,8 +2536,13 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev) { struct hci_conn_params *p; - list_for_each_entry(p, &hdev->le_conn_params, list) + list_for_each_entry(p, &hdev->le_conn_params, list) { + if (p->conn) { + hci_conn_drop(p->conn); + p->conn = NULL; + } list_del_init(&p->action); + } BT_DBG("All LE pending actions cleared"); } @@ -2578,8 +2583,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) hci_dev_lock(hdev); hci_inquiry_cache_flush(hdev); - hci_conn_hash_flush(hdev); hci_pend_le_actions_clear(hdev); + hci_conn_hash_flush(hdev); hci_dev_unlock(hdev); hci_notify(hdev, HCI_DEV_DOWN); @@ -3727,6 +3732,9 @@ void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) if (!params) return; + if (params->conn) + hci_conn_drop(params->conn); + list_del(¶ms->action); list_del(¶ms->list); kfree(params); @@ -3757,6 +3765,8 @@ void hci_conn_params_clear_all(struct hci_dev *hdev) struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { + if (params->conn) + hci_conn_drop(params->conn); list_del(¶ms->action); list_del(¶ms->list); kfree(params); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index be35598984d9..a6000823f0ff 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4221,8 +4221,13 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_proto_connect_cfm(conn, ev->status); params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); - if (params) + if (params) { list_del_init(¶ms->action); + if (params->conn) { + hci_conn_drop(params->conn); + params->conn = NULL; + } + } unlock: hci_update_background_scan(hdev); @@ -4304,8 +4309,16 @@ static void check_pending_le_conn(struct hci_dev *hdev, bdaddr_t *addr, conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW, HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER); - if (!IS_ERR(conn)) + if (!IS_ERR(conn)) { + /* Store the pointer since we don't really have any + * other owner of the object besides the params that + * triggered it. This way we can abort the connection if + * the parameters get removed and keep the reference + * count consistent once the connection is established. + */ + params->conn = conn; return; + } switch (PTR_ERR(conn)) { case -EBUSY: diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b4845f4b2bb4..7751c92c8c57 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1174,7 +1174,7 @@ static void br_multicast_add_router(struct net_bridge *br, } if (slot) - hlist_add_after_rcu(slot, &port->rlist); + hlist_add_behind_rcu(&port->rlist, slot); else hlist_add_head_rcu(&port->rlist, &br->router_list); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 62a7fa2e3569..b6c04cbcfdc5 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -309,6 +309,9 @@ struct br_input_skb_cb { int igmp; int mrouters_only; #endif +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + bool vlan_filtered; +#endif }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index febb0f87fa37..3ba57fcdcd13 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -27,9 +27,13 @@ static void __vlan_add_flags(struct net_port_vlans *v, u16 vid, u16 flags) { if (flags & BRIDGE_VLAN_INFO_PVID) __vlan_add_pvid(v, vid); + else + __vlan_delete_pvid(v, vid); if (flags & BRIDGE_VLAN_INFO_UNTAGGED) set_bit(vid, v->untagged_bitmap); + else + clear_bit(vid, v->untagged_bitmap); } static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) @@ -125,7 +129,8 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, { u16 vid; - if (!br->vlan_enabled) + /* If this packet was not filtered at input, let it pass */ + if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) goto out; /* Vlan filter table must be configured at this point. The @@ -164,8 +169,10 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, /* If VLAN filtering is disabled on the bridge, all packets are * permitted. */ - if (!br->vlan_enabled) + if (!br->vlan_enabled) { + BR_INPUT_SKB_CB(skb)->vlan_filtered = false; return true; + } /* If there are no vlan in the permitted list, all packets are * rejected. @@ -173,6 +180,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, if (!v) goto drop; + BR_INPUT_SKB_CB(skb)->vlan_filtered = true; proto = br->vlan_proto; /* If vlan tx offload is disabled on bridge device and frame was @@ -181,7 +189,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, */ if (unlikely(!vlan_tx_tag_present(skb) && skb->protocol == proto)) { - skb = vlan_untag(skb); + skb = skb_vlan_untag(skb); if (unlikely(!skb)) return false; } @@ -251,7 +259,8 @@ bool br_allowed_egress(struct net_bridge *br, { u16 vid; - if (!br->vlan_enabled) + /* If this packet was not filtered at input, let it pass */ + if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) return true; if (!v) @@ -270,6 +279,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) struct net_bridge *br = p->br; struct net_port_vlans *v; + /* If filtering was disabled at input, let it pass. */ if (!br->vlan_enabled) return true; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 1059ed3bc255..6d69631b9f4d 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -327,10 +327,7 @@ find_inlist_lock_noload(struct list_head *head, const char *name, int *error, char name[EBT_FUNCTION_MAXNAMELEN]; } *e; - *error = mutex_lock_interruptible(mutex); - if (*error != 0) - return NULL; - + mutex_lock(mutex); list_for_each_entry(e, head, list) { if (strcmp(e->name, name) == 0) return e; @@ -1203,10 +1200,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table) table->private = newinfo; rwlock_init(&table->lock); - ret = mutex_lock_interruptible(&ebt_mutex); - if (ret != 0) - goto free_chainstack; - + mutex_lock(&ebt_mutex); list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) { if (strcmp(t->name, table->name) == 0) { ret = -EEXIST; diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 96238ba95f2b..de6662b14e1f 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -13,8 +13,6 @@ #include "auth_x.h" #include "auth_x_protocol.h" -#define TEMP_TICKET_BUF_LEN 256 - static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); static int ceph_x_is_authenticated(struct ceph_auth_client *ac) @@ -64,7 +62,7 @@ static int ceph_x_encrypt(struct ceph_crypto_key *secret, } static int ceph_x_decrypt(struct ceph_crypto_key *secret, - void **p, void *end, void *obuf, size_t olen) + void **p, void *end, void **obuf, size_t olen) { struct ceph_x_encrypt_header head; size_t head_len = sizeof(head); @@ -75,8 +73,14 @@ static int ceph_x_decrypt(struct ceph_crypto_key *secret, return -EINVAL; dout("ceph_x_decrypt len %d\n", len); - ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen, - *p, len); + if (*obuf == NULL) { + *obuf = kmalloc(len, GFP_NOFS); + if (!*obuf) + return -ENOMEM; + olen = len; + } + + ret = ceph_decrypt2(secret, &head, &head_len, *obuf, &olen, *p, len); if (ret) return ret; if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC) @@ -129,139 +133,120 @@ static void remove_ticket_handler(struct ceph_auth_client *ac, kfree(th); } -static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, - struct ceph_crypto_key *secret, - void *buf, void *end) +static int process_one_ticket(struct ceph_auth_client *ac, + struct ceph_crypto_key *secret, + void **p, void *end) { struct ceph_x_info *xi = ac->private; - int num; - void *p = buf; + int type; + u8 tkt_struct_v, blob_struct_v; + struct ceph_x_ticket_handler *th; + void *dbuf = NULL; + void *dp, *dend; + int dlen; + char is_enc; + struct timespec validity; + struct ceph_crypto_key old_key; + void *ticket_buf = NULL; + void *tp, *tpend; + struct ceph_timespec new_validity; + struct ceph_crypto_key new_session_key; + struct ceph_buffer *new_ticket_blob; + unsigned long new_expires, new_renew_after; + u64 new_secret_id; int ret; - char *dbuf; - char *ticket_buf; - u8 reply_struct_v; - dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); - if (!dbuf) - return -ENOMEM; + ceph_decode_need(p, end, sizeof(u32) + 1, bad); - ret = -ENOMEM; - ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); - if (!ticket_buf) - goto out_dbuf; + type = ceph_decode_32(p); + dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); - ceph_decode_need(&p, end, 1 + sizeof(u32), bad); - reply_struct_v = ceph_decode_8(&p); - if (reply_struct_v != 1) + tkt_struct_v = ceph_decode_8(p); + if (tkt_struct_v != 1) goto bad; - num = ceph_decode_32(&p); - dout("%d tickets\n", num); - while (num--) { - int type; - u8 tkt_struct_v, blob_struct_v; - struct ceph_x_ticket_handler *th; - void *dp, *dend; - int dlen; - char is_enc; - struct timespec validity; - struct ceph_crypto_key old_key; - void *tp, *tpend; - struct ceph_timespec new_validity; - struct ceph_crypto_key new_session_key; - struct ceph_buffer *new_ticket_blob; - unsigned long new_expires, new_renew_after; - u64 new_secret_id; - - ceph_decode_need(&p, end, sizeof(u32) + 1, bad); - - type = ceph_decode_32(&p); - dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); - - tkt_struct_v = ceph_decode_8(&p); - if (tkt_struct_v != 1) - goto bad; - - th = get_ticket_handler(ac, type); - if (IS_ERR(th)) { - ret = PTR_ERR(th); - goto out; - } - /* blob for me */ - dlen = ceph_x_decrypt(secret, &p, end, dbuf, - TEMP_TICKET_BUF_LEN); - if (dlen <= 0) { - ret = dlen; - goto out; - } - dout(" decrypted %d bytes\n", dlen); - dend = dbuf + dlen; - dp = dbuf; + th = get_ticket_handler(ac, type); + if (IS_ERR(th)) { + ret = PTR_ERR(th); + goto out; + } - tkt_struct_v = ceph_decode_8(&dp); - if (tkt_struct_v != 1) - goto bad; + /* blob for me */ + dlen = ceph_x_decrypt(secret, p, end, &dbuf, 0); + if (dlen <= 0) { + ret = dlen; + goto out; + } + dout(" decrypted %d bytes\n", dlen); + dp = dbuf; + dend = dp + dlen; - memcpy(&old_key, &th->session_key, sizeof(old_key)); - ret = ceph_crypto_key_decode(&new_session_key, &dp, dend); - if (ret) - goto out; + tkt_struct_v = ceph_decode_8(&dp); + if (tkt_struct_v != 1) + goto bad; - ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); - ceph_decode_timespec(&validity, &new_validity); - new_expires = get_seconds() + validity.tv_sec; - new_renew_after = new_expires - (validity.tv_sec / 4); - dout(" expires=%lu renew_after=%lu\n", new_expires, - new_renew_after); + memcpy(&old_key, &th->session_key, sizeof(old_key)); + ret = ceph_crypto_key_decode(&new_session_key, &dp, dend); + if (ret) + goto out; - /* ticket blob for service */ - ceph_decode_8_safe(&p, end, is_enc, bad); - tp = ticket_buf; - if (is_enc) { - /* encrypted */ - dout(" encrypted ticket\n"); - dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf, - TEMP_TICKET_BUF_LEN); - if (dlen < 0) { - ret = dlen; - goto out; - } - dlen = ceph_decode_32(&tp); - } else { - /* unencrypted */ - ceph_decode_32_safe(&p, end, dlen, bad); - ceph_decode_need(&p, end, dlen, bad); - ceph_decode_copy(&p, ticket_buf, dlen); + ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); + ceph_decode_timespec(&validity, &new_validity); + new_expires = get_seconds() + validity.tv_sec; + new_renew_after = new_expires - (validity.tv_sec / 4); + dout(" expires=%lu renew_after=%lu\n", new_expires, + new_renew_after); + + /* ticket blob for service */ + ceph_decode_8_safe(p, end, is_enc, bad); + if (is_enc) { + /* encrypted */ + dout(" encrypted ticket\n"); + dlen = ceph_x_decrypt(&old_key, p, end, &ticket_buf, 0); + if (dlen < 0) { + ret = dlen; + goto out; } - tpend = tp + dlen; - dout(" ticket blob is %d bytes\n", dlen); - ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); - blob_struct_v = ceph_decode_8(&tp); - new_secret_id = ceph_decode_64(&tp); - ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); - if (ret) + tp = ticket_buf; + dlen = ceph_decode_32(&tp); + } else { + /* unencrypted */ + ceph_decode_32_safe(p, end, dlen, bad); + ticket_buf = kmalloc(dlen, GFP_NOFS); + if (!ticket_buf) { + ret = -ENOMEM; goto out; - - /* all is well, update our ticket */ - ceph_crypto_key_destroy(&th->session_key); - if (th->ticket_blob) - ceph_buffer_put(th->ticket_blob); - th->session_key = new_session_key; - th->ticket_blob = new_ticket_blob; - th->validity = new_validity; - th->secret_id = new_secret_id; - th->expires = new_expires; - th->renew_after = new_renew_after; - dout(" got ticket service %d (%s) secret_id %lld len %d\n", - type, ceph_entity_type_name(type), th->secret_id, - (int)th->ticket_blob->vec.iov_len); - xi->have_keys |= th->service; + } + tp = ticket_buf; + ceph_decode_need(p, end, dlen, bad); + ceph_decode_copy(p, ticket_buf, dlen); } + tpend = tp + dlen; + dout(" ticket blob is %d bytes\n", dlen); + ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); + blob_struct_v = ceph_decode_8(&tp); + new_secret_id = ceph_decode_64(&tp); + ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); + if (ret) + goto out; + + /* all is well, update our ticket */ + ceph_crypto_key_destroy(&th->session_key); + if (th->ticket_blob) + ceph_buffer_put(th->ticket_blob); + th->session_key = new_session_key; + th->ticket_blob = new_ticket_blob; + th->validity = new_validity; + th->secret_id = new_secret_id; + th->expires = new_expires; + th->renew_after = new_renew_after; + dout(" got ticket service %d (%s) secret_id %lld len %d\n", + type, ceph_entity_type_name(type), th->secret_id, + (int)th->ticket_blob->vec.iov_len); + xi->have_keys |= th->service; - ret = 0; out: kfree(ticket_buf); -out_dbuf: kfree(dbuf); return ret; @@ -270,6 +255,34 @@ bad: goto out; } +static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, + struct ceph_crypto_key *secret, + void *buf, void *end) +{ + void *p = buf; + u8 reply_struct_v; + u32 num; + int ret; + + ceph_decode_8_safe(&p, end, reply_struct_v, bad); + if (reply_struct_v != 1) + return -EINVAL; + + ceph_decode_32_safe(&p, end, num, bad); + dout("%d tickets\n", num); + + while (num--) { + ret = process_one_ticket(ac, secret, &p, end); + if (ret) + return ret; + } + + return 0; + +bad: + return -EINVAL; +} + static int ceph_x_build_authorizer(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th, struct ceph_x_authorizer *au) @@ -583,13 +596,14 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th; int ret = 0; struct ceph_x_authorize_reply reply; + void *preply = &reply; void *p = au->reply_buf; void *end = p + sizeof(au->reply_buf); th = get_ticket_handler(ac, au->service); if (IS_ERR(th)) return PTR_ERR(th); - ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); + ret = ceph_x_decrypt(&th->session_key, &p, end, &preply, sizeof(reply)); if (ret < 0) return ret; if (ret != sizeof(reply)) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 1948d592aa54..b2f571dd933d 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -174,6 +174,7 @@ static struct lock_class_key socket_class; #define SKIP_BUF_SIZE 1024 static void queue_con(struct ceph_connection *con); +static void cancel_con(struct ceph_connection *con); static void con_work(struct work_struct *); static void con_fault(struct ceph_connection *con); @@ -680,7 +681,7 @@ void ceph_con_close(struct ceph_connection *con) reset_connection(con); con->peer_global_seq = 0; - cancel_delayed_work(&con->work); + cancel_con(con); con_close_socket(con); mutex_unlock(&con->mutex); } @@ -900,7 +901,7 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, BUG_ON(page_count > (int)USHRT_MAX); cursor->page_count = (unsigned short)page_count; BUG_ON(length > SIZE_MAX - cursor->page_offset); - cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE; + cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE; } static struct page * @@ -2667,19 +2668,16 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) { if (!con->ops->get(con)) { dout("%s %p ref count 0\n", __func__, con); - return -ENOENT; } if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); con->ops->put(con); - return -EBUSY; } dout("%s %p %lu\n", __func__, con, delay); - return 0; } @@ -2688,6 +2686,14 @@ static void queue_con(struct ceph_connection *con) (void) queue_con_delay(con, 0); } +static void cancel_con(struct ceph_connection *con) +{ + if (cancel_delayed_work(&con->work)) { + dout("%s %p\n", __func__, con); + con->ops->put(con); + } +} + static bool con_sock_closed(struct ceph_connection *con) { if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) @@ -3269,24 +3275,21 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) /* * Free a generically kmalloc'd message. */ -void ceph_msg_kfree(struct ceph_msg *m) +static void ceph_msg_free(struct ceph_msg *m) { - dout("msg_kfree %p\n", m); + dout("%s %p\n", __func__, m); ceph_kvfree(m->front.iov_base); kmem_cache_free(ceph_msg_cache, m); } -/* - * Drop a msg ref. Destroy as needed. - */ -void ceph_msg_last_put(struct kref *kref) +static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); LIST_HEAD(data); struct list_head *links; struct list_head *next; - dout("ceph_msg_put last one on %p\n", m); + dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); /* drop middle, data, if any */ @@ -3308,9 +3311,25 @@ void ceph_msg_last_put(struct kref *kref) if (m->pool) ceph_msgpool_put(m->pool, m); else - ceph_msg_kfree(m); + ceph_msg_free(m); +} + +struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) +{ + dout("%s %p (was %d)\n", __func__, msg, + atomic_read(&msg->kref.refcount)); + kref_get(&msg->kref); + return msg; +} +EXPORT_SYMBOL(ceph_msg_get); + +void ceph_msg_put(struct ceph_msg *msg) +{ + dout("%s %p (was %d)\n", __func__, msg, + atomic_read(&msg->kref.refcount)); + kref_put(&msg->kref, ceph_msg_release); } -EXPORT_SYMBOL(ceph_msg_last_put); +EXPORT_SYMBOL(ceph_msg_put); void ceph_msg_dump(struct ceph_msg *msg) { diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 067d3af2eaf6..61fcfc304f68 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1181,7 +1181,15 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, if (!m) { pr_info("alloc_msg unknown type %d\n", type); *skip = 1; + } else if (front_len > m->front_alloc_len) { + pr_warning("mon_alloc_msg front %d > prealloc %d (%u#%llu)\n", + front_len, m->front_alloc_len, + (unsigned int)con->peer_name.type, + le64_to_cpu(con->peer_name.num)); + ceph_msg_put(m); + m = ceph_msg_new(type, front_len, GFP_NOFS, false); } + return m; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 05be0c181695..30f6faf3584f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -297,12 +297,21 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, /* * requests */ -void ceph_osdc_release_request(struct kref *kref) +static void ceph_osdc_release_request(struct kref *kref) { - struct ceph_osd_request *req; + struct ceph_osd_request *req = container_of(kref, + struct ceph_osd_request, r_kref); unsigned int which; - req = container_of(kref, struct ceph_osd_request, r_kref); + dout("%s %p (r_request %p r_reply %p)\n", __func__, req, + req->r_request, req->r_reply); + WARN_ON(!RB_EMPTY_NODE(&req->r_node)); + WARN_ON(!list_empty(&req->r_req_lru_item)); + WARN_ON(!list_empty(&req->r_osd_item)); + WARN_ON(!list_empty(&req->r_linger_item)); + WARN_ON(!list_empty(&req->r_linger_osd_item)); + WARN_ON(req->r_osd); + if (req->r_request) ceph_msg_put(req->r_request); if (req->r_reply) { @@ -320,7 +329,22 @@ void ceph_osdc_release_request(struct kref *kref) kmem_cache_free(ceph_osd_request_cache, req); } -EXPORT_SYMBOL(ceph_osdc_release_request); + +void ceph_osdc_get_request(struct ceph_osd_request *req) +{ + dout("%s %p (was %d)\n", __func__, req, + atomic_read(&req->r_kref.refcount)); + kref_get(&req->r_kref); +} +EXPORT_SYMBOL(ceph_osdc_get_request); + +void ceph_osdc_put_request(struct ceph_osd_request *req) +{ + dout("%s %p (was %d)\n", __func__, req, + atomic_read(&req->r_kref.refcount)); + kref_put(&req->r_kref, ceph_osdc_release_request); +} +EXPORT_SYMBOL(ceph_osdc_put_request); struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, @@ -364,7 +388,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); - INIT_LIST_HEAD(&req->r_linger_osd); + INIT_LIST_HEAD(&req->r_linger_osd_item); INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); @@ -916,7 +940,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, * list at the end to keep things in tid order. */ list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, - r_linger_osd) { + r_linger_osd_item) { /* * reregister request prior to unregistering linger so * that r_osd is preserved. @@ -1008,6 +1032,8 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) { dout("__remove_osd %p\n", osd); BUG_ON(!list_empty(&osd->o_requests)); + BUG_ON(!list_empty(&osd->o_linger_requests)); + rb_erase(&osd->o_node, &osdc->osds); list_del_init(&osd->o_osd_lru); ceph_con_close(&osd->o_con); @@ -1029,12 +1055,23 @@ static void remove_all_osds(struct ceph_osd_client *osdc) static void __move_osd_to_lru(struct ceph_osd_client *osdc, struct ceph_osd *osd) { - dout("__move_osd_to_lru %p\n", osd); + dout("%s %p\n", __func__, osd); BUG_ON(!list_empty(&osd->o_osd_lru)); + list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; } +static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + dout("%s %p\n", __func__, osd); + + if (list_empty(&osd->o_requests) && + list_empty(&osd->o_linger_requests)) + __move_osd_to_lru(osdc, osd); +} + static void __remove_osd_from_lru(struct ceph_osd *osd) { dout("__remove_osd_from_lru %p\n", osd); @@ -1175,6 +1212,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, dout("__unregister_request %p tid %lld\n", req, req->r_tid); rb_erase(&req->r_node, &osdc->requests); + RB_CLEAR_NODE(&req->r_node); osdc->num_requests--; if (req->r_osd) { @@ -1182,12 +1220,8 @@ static void __unregister_request(struct ceph_osd_client *osdc, ceph_msg_revoke(req->r_request); list_del_init(&req->r_osd_item); - if (list_empty(&req->r_osd->o_requests) && - list_empty(&req->r_osd->o_linger_requests)) { - dout("moving osd to %p lru\n", req->r_osd); - __move_osd_to_lru(osdc, req->r_osd); - } - if (list_empty(&req->r_linger_item)) + maybe_move_osd_to_lru(osdc, req->r_osd); + if (list_empty(&req->r_linger_osd_item)) req->r_osd = NULL; } @@ -1214,45 +1248,39 @@ static void __cancel_request(struct ceph_osd_request *req) static void __register_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { - dout("__register_linger_request %p\n", req); + dout("%s %p tid %llu\n", __func__, req, req->r_tid); + WARN_ON(!req->r_linger); + ceph_osdc_get_request(req); list_add_tail(&req->r_linger_item, &osdc->req_linger); if (req->r_osd) - list_add_tail(&req->r_linger_osd, + list_add_tail(&req->r_linger_osd_item, &req->r_osd->o_linger_requests); } static void __unregister_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { - dout("__unregister_linger_request %p\n", req); + WARN_ON(!req->r_linger); + + if (list_empty(&req->r_linger_item)) { + dout("%s %p tid %llu not registered\n", __func__, req, + req->r_tid); + return; + } + + dout("%s %p tid %llu\n", __func__, req, req->r_tid); list_del_init(&req->r_linger_item); - if (req->r_osd) { - list_del_init(&req->r_linger_osd); - if (list_empty(&req->r_osd->o_requests) && - list_empty(&req->r_osd->o_linger_requests)) { - dout("moving osd to %p lru\n", req->r_osd); - __move_osd_to_lru(osdc, req->r_osd); - } + if (req->r_osd) { + list_del_init(&req->r_linger_osd_item); + maybe_move_osd_to_lru(osdc, req->r_osd); if (list_empty(&req->r_osd_item)) req->r_osd = NULL; } ceph_osdc_put_request(req); } -void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - mutex_lock(&osdc->request_mutex); - if (req->r_linger) { - req->r_linger = 0; - __unregister_linger_request(osdc, req); - } - mutex_unlock(&osdc->request_mutex); -} -EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); - void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { @@ -2430,6 +2458,25 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, EXPORT_SYMBOL(ceph_osdc_start_request); /* + * Unregister a registered request. The request is not completed (i.e. + * no callbacks or wakeups) - higher layers are supposed to know what + * they are canceling. + */ +void ceph_osdc_cancel_request(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + + mutex_lock(&osdc->request_mutex); + if (req->r_linger) + __unregister_linger_request(osdc, req); + __unregister_request(osdc, req); + mutex_unlock(&osdc->request_mutex); + + dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); +} +EXPORT_SYMBOL(ceph_osdc_cancel_request); + +/* * wait for a request to complete */ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, @@ -2437,18 +2484,18 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, { int rc; + dout("%s %p tid %llu\n", __func__, req, req->r_tid); + rc = wait_for_completion_interruptible(&req->r_completion); if (rc < 0) { - mutex_lock(&osdc->request_mutex); - __cancel_request(req); - __unregister_request(osdc, req); - mutex_unlock(&osdc->request_mutex); + dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); + ceph_osdc_cancel_request(req); complete_request(req); - dout("wait_request tid %llu canceled/timed out\n", req->r_tid); return rc; } - dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); + dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, + req->r_result); return req->r_result; } EXPORT_SYMBOL(ceph_osdc_wait_request); diff --git a/net/core/Makefile b/net/core/Makefile index 71093d94ad2b..235e6c50708d 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o -obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 488dd1a825c0..fdbc9a81d4c2 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -775,7 +775,7 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) EXPORT_SYMBOL(__skb_checksum_complete); /** - * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. + * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec. * @skb: skbuff * @hlen: hardware length * @iov: io vector diff --git a/net/core/dev.c b/net/core/dev.c index 1c15b189c52b..130d64220229 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1284,7 +1284,6 @@ static int __dev_open(struct net_device *dev) clear_bit(__LINK_STATE_START, &dev->state); else { dev->flags |= IFF_UP; - net_dmaengine_get(); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1363,7 +1362,6 @@ static int __dev_close_many(struct list_head *head) ops->ndo_stop(dev); dev->flags &= ~IFF_UP; - net_dmaengine_put(); netpoll_poll_enable(dev); } @@ -2587,13 +2585,19 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) return harmonize_features(skb, features); } - features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); + features = netdev_intersect_features(features, + skb->dev->vlan_features | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) - features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX; + features = netdev_intersect_features(features, + NETIF_F_SG | + NETIF_F_HIGHDMA | + NETIF_F_FRAGLIST | + NETIF_F_GEN_CSUM | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); return harmonize_features(skb, features); } @@ -3602,7 +3606,7 @@ another_round: if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || skb->protocol == cpu_to_be16(ETH_P_8021AD)) { - skb = vlan_untag(skb); + skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto unlock; } @@ -4499,14 +4503,6 @@ static void net_rx_action(struct softirq_action *h) out: net_rps_action_and_irq_enable(sd); -#ifdef CONFIG_NET_DMA - /* - * There may not be any more sk_buffs coming right now, so push - * any pending DMA copies to hardware - */ - dma_issue_pending_all(); -#endif - return; softnet_break: @@ -4803,9 +4799,14 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev, sysfs_remove_link(&(dev->dev.kobj), linkname); } -#define netdev_adjacent_is_neigh_list(dev, dev_list) \ - (dev_list == &dev->adj_list.upper || \ - dev_list == &dev->adj_list.lower) +static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + return (dev_list == &dev->adj_list.upper || + dev_list == &dev->adj_list.lower) && + net_eq(dev_net(dev), dev_net(adj_dev)); +} static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, @@ -4835,7 +4836,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, pr_debug("dev_hold for %s, because of link added from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); - if (netdev_adjacent_is_neigh_list(dev, dev_list)) { + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj; @@ -4856,7 +4857,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, return 0; remove_symlinks: - if (netdev_adjacent_is_neigh_list(dev, dev_list)) + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: kfree(adj); @@ -4889,7 +4890,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); - if (netdev_adjacent_is_neigh_list(dev, dev_list)) + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); @@ -5159,11 +5160,65 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); +void netdev_adjacent_add_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.lower); + } +} + +void netdev_adjacent_del_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.lower); + } +} + void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) { struct netdev_adjacent *iter; + struct net *net = dev_net(dev); + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(iter->dev, dev, @@ -5171,6 +5226,8 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) } list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(iter->dev, dev, @@ -6773,6 +6830,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); + netdev_adjacent_del_links(dev); /* Actually switch the network namespace */ dev_net_set(dev, net); @@ -6787,6 +6845,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + netdev_adjacent_add_links(dev); /* Fixup kobjects */ err = device_rename(&dev->dev, dev->name); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 6b5b6e7013ca..9d33dfffca19 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -197,7 +197,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * as destination. A new timer with the interval specified in the * configuration TLV is created. Upon each interval, the latest statistics * will be read from &bstats and the estimated rate will be stored in - * &rate_est with the statistics lock grabed during this period. + * &rate_est with the statistics lock grabbed during this period. * * Returns 0 on success or a negative error code. * diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 9d3d9e78397b..2ddbce4cce14 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -206,7 +206,7 @@ EXPORT_SYMBOL(gnet_stats_copy_queue); * @st: application specific statistics data * @len: length of data * - * Appends the application sepecific statistics to the top level TLV created by + * Appends the application specific statistics to the top level TLV created by * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping * handle is in backward compatibility mode. * diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 85b62691f4f2..7c6b51a58968 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -373,9 +373,11 @@ struct net *get_net_ns_by_pid(pid_t pid) tsk = find_task_by_vpid(pid); if (tsk) { struct nsproxy *nsproxy; - nsproxy = task_nsproxy(tsk); + task_lock(tsk); + nsproxy = tsk->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); + task_unlock(tsk); } rcu_read_unlock(); return net; @@ -632,11 +634,11 @@ static void *netns_get(struct task_struct *task) struct net *net = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); - rcu_read_unlock(); + task_unlock(task); return net; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8d39071f32d7..f0493e3b7471 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -804,7 +804,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, (nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + - nla_total_size(sizeof(struct ifla_vf_rate))); + nla_total_size(sizeof(struct ifla_vf_rate)) + + nla_total_size(sizeof(struct ifla_vf_link_state))); return size; } else return 0; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 224506a6fa80..8d289697cc7a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -62,6 +62,7 @@ #include <linux/scatterlist.h> #include <linux/errqueue.h> #include <linux/prefetch.h> +#include <linux/if_vlan.h> #include <net/protocol.h> #include <net/dst.h> @@ -2646,7 +2647,7 @@ EXPORT_SYMBOL(skb_prepare_seq_read); * skb_seq_read() will return the remaining part of the block. * * Note 1: The size of each block of data returned can be arbitrary, - * this limitation is the cost for zerocopy seqeuental + * this limitation is the cost for zerocopy sequential * reads of potentially non linear data. * * Note 2: Fragment lists within fragments are not implemented @@ -2780,7 +2781,7 @@ EXPORT_SYMBOL(skb_find_text); /** * skb_append_datato_frags - append the user data to a skb * @sk: sock structure - * @skb: skb structure to be appened with user data. + * @skb: skb structure to be appended with user data. * @getfrag: call back function to be used for getting the user data * @from: pointer to user message iov * @length: length of the iov message @@ -3151,6 +3152,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; } + /* switch back to head shinfo */ + pinfo = skb_shinfo(p); + if (pinfo->frag_list) goto merge; if (skb_gro_len(p) != pinfo->gso_size) @@ -3973,3 +3977,55 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) return shinfo->gso_size; } EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); + +static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) +{ + if (skb_cow(skb, skb_headroom(skb)) < 0) { + kfree_skb(skb); + return NULL; + } + + memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); + skb->mac_header += VLAN_HLEN; + return skb; +} + +struct sk_buff *skb_vlan_untag(struct sk_buff *skb) +{ + struct vlan_hdr *vhdr; + u16 vlan_tci; + + if (unlikely(vlan_tx_tag_present(skb))) { + /* vlan_tci is already set-up so leave this for another time */ + return skb; + } + + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + goto err_free; + + if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) + goto err_free; + + vhdr = (struct vlan_hdr *)skb->data; + vlan_tci = ntohs(vhdr->h_vlan_TCI); + __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); + + skb_pull_rcsum(skb, VLAN_HLEN); + vlan_set_encap_proto(skb, vhdr); + + skb = skb_reorder_vlan_header(skb); + if (unlikely(!skb)) + goto err_free; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_reset_mac_len(skb); + + return skb; + +err_free: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(skb_vlan_untag); diff --git a/net/core/sock.c b/net/core/sock.c index 2714811afbd8..611f424fb76b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -166,7 +166,7 @@ EXPORT_SYMBOL(sk_ns_capable); /** * sk_capable - Socket global capability test * @sk: Socket to use a capability on or through - * @cap: The global capbility to use + * @cap: The global capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in all user @@ -183,7 +183,7 @@ EXPORT_SYMBOL(sk_capable); * @sk: Socket to use a capability on or through * @cap: The capability to use * - * Test to see if the opener of the socket had when the socke was created + * Test to see if the opener of the socket had when the socket was created * and the current process has the capability @cap over the network namespace * the socket is a member of. */ @@ -1489,9 +1489,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&newsk->sk_async_wait_queue); -#endif spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); @@ -1822,6 +1819,9 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, order); if (page) goto fill_page; + /* Do not retry other high order allocations */ + order = 1; + max_page_order = 0; } order--; } @@ -1863,16 +1863,14 @@ EXPORT_SYMBOL(sock_alloc_send_skb); * skb_page_frag_refill - check that a page_frag contains enough room * @sz: minimum size of the fragment we want to get * @pfrag: pointer to page_frag - * @prio: priority for memory allocation + * @gfp: priority for memory allocation * * Note: While this allocator tries to use high order pages, there is * no guarantee that allocations succeed. Therefore, @sz MUST be * less or equal than PAGE_SIZE. */ -bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { - int order; - if (pfrag->page) { if (atomic_read(&pfrag->page->_count) == 1) { pfrag->offset = 0; @@ -1883,20 +1881,21 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) put_page(pfrag->page); } - order = SKB_FRAG_PAGE_ORDER; - do { - gfp_t gfp = prio; - - if (order) - gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; - pfrag->page = alloc_pages(gfp, order); + pfrag->offset = 0; + if (SKB_FRAG_PAGE_ORDER) { + pfrag->page = alloc_pages(gfp | __GFP_COMP | + __GFP_NOWARN | __GFP_NORETRY, + SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { - pfrag->offset = 0; - pfrag->size = PAGE_SIZE << order; + pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; return true; } - } while (--order >= 0); - + } + pfrag->page = alloc_page(gfp); + if (likely(pfrag->page)) { + pfrag->size = PAGE_SIZE; + return true; + } return false; } EXPORT_SYMBOL(skb_page_frag_refill); @@ -2306,9 +2305,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&sk->sk_async_wait_queue); -#endif sk->sk_send_head = NULL; diff --git a/net/core/user_dma.c b/net/core/user_dma.c deleted file mode 100644 index 1b5fefdb8198..000000000000 --- a/net/core/user_dma.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include <linux/dmaengine.h> -#include <linux/socket.h> -#include <linux/export.h> -#include <net/tcp.h> -#include <net/netdma.h> - -#define NET_DMA_DEFAULT_COPYBREAK 4096 - -int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; -EXPORT_SYMBOL(sysctl_tcp_dma_copybreak); - -/** - * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. - * @skb - buffer to copy - * @offset - offset in the buffer to start copying from - * @iovec - io vector to copy to - * @len - amount of data to copy from buffer to iovec - * @pinned_list - locked iovec buffer data - * - * Note: the iovec is modified during the copy. - */ -int dma_skb_copy_datagram_iovec(struct dma_chan *chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - dma_cookie_t cookie = 0; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_memcpy_to_iovec(chan, to, pinned_list, - skb->data + offset, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - copy = end - offset; - if (copy > 0) { - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - - cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, - frag->page_offset + offset - start, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - copy = end - offset; - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_skb_copy_datagram_iovec(chan, frag_iter, - offset - start, - to, copy, - pinned_list); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - -end: - if (!len) { - skb->dma_cookie = cookie; - return cookie; - } - -fault: - return -EFAULT; -} diff --git a/net/dccp/proto.c b/net/dccp/proto.c index de2c1e719305..f440cc7c9f72 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -848,7 +848,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, default: dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); } verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { @@ -905,7 +905,7 @@ verify_sock_status: len = skb->len; found_fin_ok: if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); break; } while (1); out: diff --git a/net/ieee802154/6lowpan_rtnl.c b/net/ieee802154/6lowpan_rtnl.c index 016b77ee88f0..6591d27e53a4 100644 --- a/net/ieee802154/6lowpan_rtnl.c +++ b/net/ieee802154/6lowpan_rtnl.c @@ -246,7 +246,7 @@ lowpan_alloc_frag(struct sk_buff *skb, int size, return ERR_PTR(-rc); } } else { - frag = ERR_PTR(ENOMEM); + frag = ERR_PTR(-ENOMEM); } return frag; @@ -437,7 +437,7 @@ static void lowpan_setup(struct net_device *dev) /* Frame Control + Sequence Number + Address fields + Security Header */ dev->hard_header_len = 2 + 1 + 20 + 14; dev->needed_tailroom = 2; /* FCS */ - dev->mtu = 1281; + dev->mtu = IPV6_MIN_MTU; dev->tx_queue_len = 0; dev->flags = IFF_BROADCAST | IFF_MULTICAST; dev->watchdog_timeo = 0; diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c index ffec6ce51005..32755cb7e64e 100644 --- a/net/ieee802154/reassembly.c +++ b/net/ieee802154/reassembly.c @@ -355,8 +355,6 @@ int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type) struct net *net = dev_net(skb->dev); struct lowpan_frag_info *frag_info = lowpan_cb(skb); struct ieee802154_addr source, dest; - struct netns_ieee802154_lowpan *ieee802154_lowpan = - net_ieee802154_lowpan(net); int err; source = mac_cb(skb)->source; @@ -366,8 +364,10 @@ int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type) if (err < 0) goto err; - if (frag_info->d_size > ieee802154_lowpan->max_dsize) + if (frag_info->d_size > IPV6_MIN_MTU) { + net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n"); goto err; + } fq = fq_find(net, frag_info, &source, &dest); if (fq != NULL) { @@ -415,13 +415,6 @@ static struct ctl_table lowpan_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "6lowpanfrag_max_datagram_size", - .data = &init_net.ieee802154_lowpan.max_dsize, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { } }; @@ -458,7 +451,6 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) table[1].data = &ieee802154_lowpan->frags.low_thresh; table[1].extra2 = &ieee802154_lowpan->frags.high_thresh; table[2].data = &ieee802154_lowpan->frags.timeout; - table[3].data = &ieee802154_lowpan->max_dsize; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) @@ -533,7 +525,6 @@ static int __net_init lowpan_frags_init_net(struct net *net) ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; - ieee802154_lowpan->max_dsize = 0xFFFF; inet_frags_init_net(&ieee802154_lowpan->frags); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5afeb5aa4c7c..e9cb2588e416 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -940,7 +940,7 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) last = li; } if (last) - hlist_add_after_rcu(&last->hlist, &new->hlist); + hlist_add_behind_rcu(&new->hlist, &last->hlist); else hlist_add_before_rcu(&new->hlist, &li->hlist); } diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index afed1aac2638..bda4bb8ae260 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -79,10 +79,10 @@ static void __tunnel_dst_set(struct ip_tunnel_dst *idst, idst->saddr = saddr; } -static void tunnel_dst_set(struct ip_tunnel *t, +static noinline void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst, __be32 saddr) { - __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst, saddr); + __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr); } static void tunnel_dst_reset(struct ip_tunnel *t) @@ -106,7 +106,7 @@ static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, struct dst_entry *dst; rcu_read_lock(); - idst = this_cpu_ptr(t->dst_cache); + idst = raw_cpu_ptr(t->dst_cache); dst = rcu_dereference(idst->dst); if (dst && !atomic_inc_not_zero(&dst->__refcnt)) dst = NULL; @@ -764,9 +764,14 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); - if (!t && (cmd == SIOCADDTUNNEL)) { - t = ip_tunnel_create(net, itn, p); - err = PTR_ERR_OR_ZERO(t); + if (cmd == SIOCADDTUNNEL) { + if (!t) { + t = ip_tunnel_create(net, itn, p); + err = PTR_ERR_OR_ZERO(t); + break; + } + + err = -EEXIST; break; } if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index fb173126f03d..7cbcaf4f0194 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -82,6 +82,52 @@ config NF_TABLES_ARP help This option enables the ARP support for nf_tables. +config NF_NAT_IPV4 + tristate "IPv4 NAT" + depends on NF_CONNTRACK_IPV4 + default m if NETFILTER_ADVANCED=n + select NF_NAT + help + The IPv4 NAT option allows masquerading, port forwarding and other + forms of full Network Address Port Translation. This can be + controlled by iptables or nft. + +if NF_NAT_IPV4 + +config NF_NAT_SNMP_BASIC + tristate "Basic SNMP-ALG support" + depends on NF_CONNTRACK_SNMP + depends on NETFILTER_ADVANCED + default NF_NAT && NF_CONNTRACK_SNMP + ---help--- + + This module implements an Application Layer Gateway (ALG) for + SNMP payloads. In conjunction with NAT, it allows a network + management system to access multiple private networks with + conflicting addresses. It works by modifying IP addresses + inside SNMP payloads to match IP-layer NAT mapping. + + This is the "basic" form of SNMP-ALG, as described in RFC 2962 + + To compile it as a module, choose M here. If unsure, say N. + +config NF_NAT_PROTO_GRE + tristate + depends on NF_CT_PROTO_GRE + +config NF_NAT_PPTP + tristate + depends on NF_CONNTRACK + default NF_CONNTRACK_PPTP + select NF_NAT_PROTO_GRE + +config NF_NAT_H323 + tristate + depends on NF_CONNTRACK + default NF_CONNTRACK_H323 + +endif # NF_NAT_IPV4 + config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" default m if NETFILTER_ADVANCED=n @@ -170,19 +216,21 @@ config IP_NF_TARGET_SYNPROXY To compile it as a module, choose M here. If unsure, say N. # NAT + specific targets: nf_conntrack -config NF_NAT_IPV4 - tristate "IPv4 NAT" +config IP_NF_NAT + tristate "iptables NAT support" depends on NF_CONNTRACK_IPV4 default m if NETFILTER_ADVANCED=n select NF_NAT + select NF_NAT_IPV4 + select NETFILTER_XT_NAT help - The IPv4 NAT option allows masquerading, port forwarding and other - forms of full Network Address Port Translation. It is controlled by - the `nat' table in iptables: see the man page for iptables(8). + This enables the `nat' table in iptables. This allows masquerading, + port forwarding and other forms of full Network Address Port + Translation. To compile it as a module, choose M here. If unsure, say N. -if NF_NAT_IPV4 +if IP_NF_NAT config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" @@ -214,47 +262,7 @@ config IP_NF_TARGET_REDIRECT (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_TARGET_REDIRECT. -endif - -config NF_NAT_SNMP_BASIC - tristate "Basic SNMP-ALG support" - depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4 - depends on NETFILTER_ADVANCED - default NF_NAT && NF_CONNTRACK_SNMP - ---help--- - - This module implements an Application Layer Gateway (ALG) for - SNMP payloads. In conjunction with NAT, it allows a network - management system to access multiple private networks with - conflicting addresses. It works by modifying IP addresses - inside SNMP payloads to match IP-layer NAT mapping. - - This is the "basic" form of SNMP-ALG, as described in RFC 2962 - - To compile it as a module, choose M here. If unsure, say N. - -# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y), -# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker. -# From kconfig-language.txt: -# -# <expr> '&&' <expr> (6) -# -# (6) Returns the result of min(/expr/, /expr/). - -config NF_NAT_PROTO_GRE - tristate - depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE - -config NF_NAT_PPTP - tristate - depends on NF_CONNTRACK && NF_NAT_IPV4 - default NF_NAT_IPV4 && NF_CONNTRACK_PPTP - select NF_NAT_PROTO_GRE - -config NF_NAT_H323 - tristate - depends on NF_CONNTRACK && NF_NAT_IPV4 - default NF_NAT_IPV4 && NF_CONNTRACK_H323 +endif # IP_NF_NAT # mangle + specific targets config IP_NF_MANGLE diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 33001621465b..edf4af32e9f2 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -43,7 +43,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o # the three instances of ip_tables obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o -obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o +obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 190199851c9a..cbadb942c332 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -746,7 +746,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow } n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); - if (n) { + if (!IS_ERR(n)) { if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); } else { @@ -1798,8 +1798,6 @@ local_input: no_route: RT_CACHE_STAT_INC(in_no_route); res.type = RTN_UNREACHABLE; - if (err == -ESRCH) - err = -ENETUNREACH; goto local_input; /* @@ -2267,9 +2265,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, return rt; if (flp4->flowi4_proto) - rt = (struct rtable *) xfrm_lookup(net, &rt->dst, - flowi4_to_flowi(flp4), - sk, 0); + rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0); return rt; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 79a007c52558..a9fde0eef77c 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -628,15 +628,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -#ifdef CONFIG_NET_DMA - { - .procname = "tcp_dma_copybreak", - .data = &sysctl_tcp_dma_copybreak, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif { .procname = "tcp_slow_start_after_idle", .data = &sysctl_tcp_slow_start_after_idle, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 744af67a5989..8ee43ae90396 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -274,7 +274,6 @@ #include <net/tcp.h> #include <net/xfrm.h> #include <net/ip.h> -#include <net/netdma.h> #include <net/sock.h> #include <asm/uaccess.h> @@ -426,13 +425,15 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb) +static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb) { - struct skb_shared_info *shinfo = skb_shinfo(skb); + if (sk->sk_tsflags) { + struct skb_shared_info *shinfo = skb_shinfo(skb); - sock_tx_timestamp(sk, &shinfo->tx_flags); - if (shinfo->tx_flags & SKBTX_ANY_SW_TSTAMP) - shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + sock_tx_timestamp(sk, &shinfo->tx_flags); + if (shinfo->tx_flags & SKBTX_ANY_TSTAMP) + shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + } } /* @@ -1186,13 +1187,6 @@ new_segment: goto wait_for_memory; /* - * All packets are restored as if they have - * already been sent. - */ - if (tp->repair) - TCP_SKB_CB(skb)->when = tcp_time_stamp; - - /* * Check whether we can use HW checksum. */ if (sk->sk_route_caps & NETIF_F_ALL_CSUM) @@ -1201,6 +1195,13 @@ new_segment: skb_entail(sk, skb); copy = size_goal; max = size_goal; + + /* All packets are restored as if they have + * already been sent. skb_mstamp isn't set to + * avoid wrong rtt estimation. + */ + if (tp->repair) + TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; } /* Try to append data to the end of skb. */ @@ -1392,7 +1393,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ -void tcp_cleanup_rbuf(struct sock *sk, int copied) +static void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; @@ -1468,39 +1469,6 @@ static void tcp_prequeue_process(struct sock *sk) tp->ucopy.memory = 0; } -#ifdef CONFIG_NET_DMA -static void tcp_service_net_dma(struct sock *sk, bool wait) -{ - dma_cookie_t done, used; - dma_cookie_t last_issued; - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->ucopy.dma_chan) - return; - - last_issued = tp->ucopy.dma_cookie; - dma_async_issue_pending(tp->ucopy.dma_chan); - - do { - if (dma_async_is_tx_complete(tp->ucopy.dma_chan, - last_issued, &done, - &used) == DMA_COMPLETE) { - /* Safe to free early-copied skbs now */ - __skb_queue_purge(&sk->sk_async_wait_queue); - break; - } else { - struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_async_wait_queue)) && - (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_COMPLETE)) { - __skb_dequeue(&sk->sk_async_wait_queue); - kfree_skb(skb); - } - } - } while (wait); -} -#endif - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1518,7 +1486,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); } return NULL; } @@ -1584,11 +1552,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } if (tcp_hdr(skb)->fin) { - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); if (!desc->count) break; tp->copied_seq = seq; @@ -1626,7 +1594,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; - bool copied_early = false; struct sk_buff *skb; u32 urg_hole = 0; @@ -1672,28 +1639,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - preempt_disable(); - skb = skb_peek_tail(&sk->sk_receive_queue); - { - int available = 0; - - if (skb) - available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); - if ((available < target) && - (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && - net_dma_find_channel()) { - preempt_enable(); - tp->ucopy.pinned_list = - dma_pin_iovec_pages(msg->msg_iov, len); - } else { - preempt_enable(); - } - } -#endif - do { u32 offset; @@ -1824,16 +1769,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Set realtime policy in scheduler __ */ } -#ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) { - if (tp->rcv_wnd == 0 && - !skb_queue_empty(&sk->sk_async_wait_queue)) { - tcp_service_net_dma(sk, true); - tcp_cleanup_rbuf(sk, copied); - } else - dma_async_issue_pending(tp->ucopy.dma_chan); - } -#endif if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1841,11 +1776,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else sk_wait_data(sk, &timeo); -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, false); /* Don't block */ - tp->ucopy.wakeup = 0; -#endif - if (user_recv) { int chunk; @@ -1903,43 +1833,13 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { -#ifdef CONFIG_NET_DMA - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan) { - tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( - tp->ucopy.dma_chan, skb, offset, - msg->msg_iov, used, - tp->ucopy.pinned_list); - - if (tp->ucopy.dma_cookie < 0) { - - pr_alert("%s: dma_cookie < 0\n", - __func__); - - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } - - dma_async_issue_pending(tp->ucopy.dma_chan); - - if ((offset + used) == skb->len) - copied_early = true; - - } else -#endif - { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; } } @@ -1959,19 +1859,15 @@ skip_copy: if (tcp_hdr(skb)->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ ++*seq; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); break; } while (len > 0); @@ -1994,16 +1890,6 @@ skip_copy: tp->ucopy.len = 0; } -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, true); /* Wait for queue to drain */ - tp->ucopy.dma_chan = NULL; - - if (tp->ucopy.pinned_list) { - dma_unpin_iovec_pages(tp->ucopy.pinned_list); - tp->ucopy.pinned_list = NULL; - } -#endif - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -2347,9 +2233,6 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); -#ifdef CONFIG_NET_DMA - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a3d47af01906..0185eea59342 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -73,7 +73,6 @@ #include <net/inet_common.h> #include <linux/ipsec.h> #include <asm/unaligned.h> -#include <net/netdma.h> #include <linux/errqueue.h> int sysctl_tcp_timestamps __read_mostly = 1; @@ -2687,7 +2686,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) */ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) { - struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); @@ -2713,12 +2711,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) if (recovered) { /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ - icsk->icsk_retransmits = 0; tcp_try_undo_recovery(sk); return; } - if (flag & FLAG_DATA_ACKED) - icsk->icsk_retransmits = 0; if (tcp_is_reno(tp)) { /* A Reno DUPACK means new data in F-RTO step 2.b above are * delivered. Lower inflight to clock out (re)tranmissions. @@ -3050,10 +3045,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt.v64 = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u8 sacked = scb->sacked; u32 acked_pcount; + if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) && + between(shinfo->tskey, prior_snd_una, tp->snd_una - 1)) + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || @@ -3107,11 +3107,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->retrans_stamp = 0; } - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) && - between(skb_shinfo(skb)->tskey, prior_snd_una, - tp->snd_una + 1)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); - if (!fully_acked) break; @@ -3405,8 +3400,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); - if (after(ack, prior_snd_una)) + if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; + icsk->icsk_retransmits = 0; + } prior_fackets = tp->fackets_out; @@ -4953,53 +4950,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, __tcp_checksum_complete_user(sk, skb); } -#ifdef CONFIG_NET_DMA -static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, - int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int dma_cookie; - bool copied_early = false; - - if (tp->ucopy.wakeup) - return false; - - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { - - dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, - skb, hlen, - tp->ucopy.iov, chunk, - tp->ucopy.pinned_list); - - if (dma_cookie < 0) - goto out; - - tp->ucopy.dma_cookie = dma_cookie; - copied_early = true; - - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - - if ((tp->ucopy.len == 0) || - (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || - (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk); - } - } else if (chunk > 0) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk); - } -out: - return copied_early; -} -#endif /* CONFIG_NET_DMA */ - /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ @@ -5179,27 +5129,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } else { int eaten = 0; - int copied_early = 0; bool fragstolen = false; - if (tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len) { -#ifdef CONFIG_NET_DMA - if (tp->ucopy.task == current && - sock_owned_by_user(sk) && - tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { - copied_early = 1; - eaten = 1; - } -#endif - if (tp->ucopy.task == current && - sock_owned_by_user(sk) && !copied_early) { - __set_current_state(TASK_RUNNING); + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sock_owned_by_user(sk)) { + __set_current_state(TASK_RUNNING); - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) - eaten = 1; - } - if (eaten) { + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -5215,9 +5153,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); + eaten = 1; } - if (copied_early) - tcp_cleanup_rbuf(sk, skb->len); } if (!eaten) { if (tcp_checksum_complete_user(sk, skb)) @@ -5254,14 +5191,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto no_ack; } - if (!copied_early || tp->rcv_nxt != tp->rcv_wup) - __tcp_ack_snd_check(sk, 0); + __tcp_ack_snd_check(sk, 0); no_ack: -#ifdef CONFIG_NET_DMA - if (copied_early) - __skb_queue_tail(&sk->sk_async_wait_queue, skb); - else -#endif if (eaten) kfree_skb_partial(skb, fragstolen); sk->sk_data_ready(sk); @@ -5979,12 +5910,14 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * timewait bucket, so that all the necessary checks * are made in the function processing timewait state. */ - if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) { + if (tcp_death_row.sysctl_tw_recycle) { bool strict; dst = af_ops->route_req(sk, &fl, req, &strict); + if (dst && strict && - !tcp_peer_is_proven(req, dst, true)) { + !tcp_peer_is_proven(req, dst, true, + tmp_opt.saw_tstamp)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } @@ -5993,7 +5926,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, else if (!sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && - !tcp_peer_is_proven(req, dst, false)) { + !tcp_peer_is_proven(req, dst, false, + tmp_opt.saw_tstamp)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 992a1f926009..fbea536cf5c0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -72,7 +72,6 @@ #include <net/inet_common.h> #include <net/timewait_sock.h> #include <net/xfrm.h> -#include <net/netdma.h> #include <net/secure_seq.h> #include <net/tcp_memcontrol.h> #include <net/busy_poll.h> @@ -271,7 +270,7 @@ EXPORT_SYMBOL(tcp_v4_connect); * It can be called through tcp_release_cb() if socket was owned by user * at the time tcp_v4_err() was called to handle ICMP message. */ -static void tcp_v4_mtu_reduced(struct sock *sk) +void tcp_v4_mtu_reduced(struct sock *sk) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); @@ -302,6 +301,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk) tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ } +EXPORT_SYMBOL(tcp_v4_mtu_reduced); static void do_redirect(struct sk_buff *skb, struct sock *sk) { @@ -1167,7 +1167,8 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); -static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +static bool __tcp_v4_inbound_md5_hash(struct sock *sk, + const struct sk_buff *skb) { /* * This gets called for each TCP segment that arrives @@ -1220,6 +1221,17 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) return false; } +static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +{ + bool ret; + + rcu_read_lock(); + ret = __tcp_v4_inbound_md5_hash(sk, skb); + rcu_read_unlock(); + + return ret; +} + #endif static void tcp_v4_init_req(struct request_sock *req, struct sock *sk, @@ -1432,16 +1444,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { struct sock *rsk; -#ifdef CONFIG_TCP_MD5SIG - /* - * We really want to reject the packet as early as possible - * if: - * o We're expecting an MD5'd packet and this is no MD5 tcp option - * o There is an MD5 option and we're not expecting one - */ - if (tcp_v4_inbound_md5_hash(sk, skb)) - goto discard; -#endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst = sk->sk_rx_dst; @@ -1644,6 +1646,18 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; + +#ifdef CONFIG_TCP_MD5SIG + /* + * We really want to reject the packet as early as possible + * if: + * o We're expecting an MD5'd packet and this is no MD5 tcp option + * o There is an MD5 option and we're not expecting one + */ + if (tcp_v4_inbound_md5_hash(sk, skb)) + goto discard_and_relse; +#endif + nf_reset(skb); if (sk_filter(sk, skb)) @@ -1655,18 +1669,8 @@ process: bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); - else -#endif - { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); - } } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); @@ -1773,6 +1777,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, #endif + .mtu_reduced = tcp_v4_mtu_reduced, }; EXPORT_SYMBOL(ipv4_specific); @@ -1825,11 +1830,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif -#ifdef CONFIG_NET_DMA - /* Cleans up our sk_async_wait_queue */ - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif - /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); @@ -2392,7 +2392,6 @@ struct proto tcp_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, - .mtu_reduced = tcp_v4_mtu_reduced, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0d54e59b9ea8..ed9c9a91851c 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -576,7 +576,8 @@ reset: tp->snd_cwnd_stamp = tcp_time_stamp; } -bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, + bool paws_check, bool timestamps) { struct tcp_metrics_block *tm; bool ret; @@ -589,7 +590,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool pa if (paws_check) { if (tm && (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && - (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) + ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW || + !timestamps)) ret = false; else ret = true; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index f597119fc4e7..bc1b83cb8309 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -14,12 +14,12 @@ #include <net/tcp.h> #include <net/protocol.h> -void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, unsigned int seq, - unsigned int mss) +static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, + unsigned int seq, unsigned int mss) { while (skb) { - if (ts_seq < (__u64) seq + mss) { - skb_shinfo(skb)->tx_flags = SKBTX_SW_TSTAMP; + if (before(ts_seq, seq + mss)) { + skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP; skb_shinfo(skb)->tskey = ts_seq; return; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8fcfc91964ec..5a7c41fbc6d3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -800,7 +800,7 @@ void tcp_release_cb(struct sock *sk) __sock_put(sk); } if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { - sk->sk_prot->mtu_reduced(sk); + inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } } @@ -1069,6 +1069,21 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de tcp_verify_left_out(tp); } +static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) && + !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) { + struct skb_shared_info *shinfo2 = skb_shinfo(skb2); + u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP; + + shinfo->tx_flags &= ~tsflags; + shinfo2->tx_flags |= tsflags; + swap(shinfo->tskey, shinfo2->tskey); + } +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. @@ -1136,6 +1151,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; buff->tstamp = skb->tstamp; + tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); @@ -1652,6 +1668,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL; skb_split(skb, buff, len); + tcp_fragment_tstamp(skb, buff); /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(sk, skb, mss_now); @@ -1917,8 +1934,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); - if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) + if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { + /* "when" is used as a start point for the retransmit timer */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; goto repair; /* Skip network transmission */ + } cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0b239fc1816e..3e118dfddd02 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1690,14 +1690,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) addrconf_mod_dad_work(ifp, 0); } -/* Join to solicited addr multicast group. */ - +/* Join to solicited addr multicast group. + * caller must hold RTNL */ void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) { struct in6_addr maddr; - ASSERT_RTNL(); - if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1705,12 +1703,11 @@ void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) ipv6_dev_mc_inc(dev, &maddr); } +/* caller must hold RTNL */ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) { struct in6_addr maddr; - ASSERT_RTNL(); - if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1718,12 +1715,11 @@ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) __ipv6_dev_mc_dec(idev, &maddr); } +/* caller must hold RTNL */ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; - ASSERT_RTNL(); - if (ifp->prefix_len >= 127) /* RFC 6164 */ return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); @@ -1732,12 +1728,11 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) ipv6_dev_ac_inc(ifp->idev->dev, &addr); } +/* caller must hold RTNL */ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; - ASSERT_RTNL(); - if (ifp->prefix_len >= 127) /* RFC 6164 */ return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); @@ -3099,11 +3094,13 @@ static int addrconf_ifdown(struct net_device *dev, int how) write_unlock_bh(&idev->lock); - /* Step 5: Discard multicast list */ - if (how) + /* Step 5: Discard anycast and multicast list */ + if (how) { + ipv6_ac_destroy_dev(idev); ipv6_mc_destroy_dev(idev); - else + } else { ipv6_mc_down(idev); + } idev->tstamp = jiffies; @@ -4773,24 +4770,21 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) addrconf_leave_solict(ifp->idev, &ifp->addr); if (!ipv6_addr_any(&ifp->peer_addr)) { struct rt6_info *rt; - struct net_device *dev = ifp->idev->dev; - - rt = rt6_lookup(dev_net(dev), &ifp->peer_addr, NULL, - dev->ifindex, 1); - if (rt) { - dst_hold(&rt->dst); - if (ip6_del_rt(rt)) - dst_free(&rt->dst); - } + + rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, + ifp->idev->dev, 0, 0); + if (rt && ip6_del_rt(rt)) + dst_free(&rt->dst); } dst_hold(&ifp->rt->dst); if (ip6_del_rt(ifp->rt)) dst_free(&ifp->rt->dst); + + rt_genid_bump_ipv6(net); break; } atomic_inc(&net->ipv6.dev_addr_genid); - rt_genid_bump_ipv6(net); } static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index e6960457f625..98cc4cd570e2 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -8,6 +8,13 @@ #include <net/addrconf.h> #include <net/ip.h> +/* if ipv6 module registers this function is used by xfrm to force all + * sockets to relookup their nodes - this is fairly expensive, be + * careful + */ +void (*__fib6_flush_trees)(struct net *); +EXPORT_SYMBOL(__fib6_flush_trees); + #define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) static inline unsigned int ipv6_addr_scope2type(unsigned int scope) diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 731e1e1722d9..fd0dc47f471d 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -277,7 +277,7 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) last = p; } if (last) - hlist_add_after_rcu(&last->list, &newp->list); + hlist_add_behind_rcu(&newp->list, &last->list); else hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); out: diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 210183244689..9a386842fd62 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -77,6 +77,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) pac->acl_next = NULL; pac->acl_addr = *addr; + rtnl_lock(); rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; @@ -137,6 +138,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) error: rcu_read_unlock(); + rtnl_unlock(); if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); return err; @@ -171,11 +173,13 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) spin_unlock_bh(&ipv6_sk_ac_lock); + rtnl_lock(); rcu_read_lock(); dev = dev_get_by_index_rcu(net, pac->acl_ifindex); if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); rcu_read_unlock(); + rtnl_unlock(); sock_kfree_s(sk, pac, sizeof(*pac)); return 0; @@ -198,6 +202,7 @@ void ipv6_sock_ac_close(struct sock *sk) spin_unlock_bh(&ipv6_sk_ac_lock); prev_index = 0; + rtnl_lock(); rcu_read_lock(); while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; @@ -212,6 +217,7 @@ void ipv6_sock_ac_close(struct sock *sk) pac = next; } rcu_read_unlock(); + rtnl_unlock(); } static void aca_put(struct ifacaddr6 *ac) @@ -233,6 +239,8 @@ int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) struct rt6_info *rt; int err; + ASSERT_RTNL(); + idev = in6_dev_get(dev); if (idev == NULL) @@ -302,6 +310,8 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca, *prev_aca; + ASSERT_RTNL(); + write_lock_bh(&idev->lock); prev_aca = NULL; for (aca = idev->ac_list; aca; aca = aca->aca_next) { @@ -341,6 +351,27 @@ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) return __ipv6_dev_ac_dec(idev, addr); } +void ipv6_ac_destroy_dev(struct inet6_dev *idev) +{ + struct ifacaddr6 *aca; + + write_lock_bh(&idev->lock); + while ((aca = idev->ac_list) != NULL) { + idev->ac_list = aca->aca_next; + write_unlock_bh(&idev->lock); + + addrconf_leave_solict(idev, &aca->aca_addr); + + dst_hold(&aca->aca_rt->dst); + ip6_del_rt(aca->aca_rt); + + aca_put(aca); + + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); +} + /* * check if the interface has this anycast address * called with rcu_read_lock() diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index cb4459bd1d29..97b9fa8de377 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -643,7 +643,7 @@ static int fib6_commit_metrics(struct dst_entry *dst, if (dst->flags & DST_HOST) { mp = dst_metrics_write_ptr(dst); } else { - mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); if (!mp) return -ENOMEM; dst_init_metrics(dst, mp, 0); @@ -1605,6 +1605,24 @@ static void fib6_prune_clones(struct net *net, struct fib6_node *fn) fib6_clean_tree(net, fn, fib6_prune_clone, 1, NULL); } +static int fib6_update_sernum(struct rt6_info *rt, void *arg) +{ + __u32 sernum = *(__u32 *)arg; + + if (rt->rt6i_node && + rt->rt6i_node->fn_sernum != sernum) + rt->rt6i_node->fn_sernum = sernum; + + return 0; +} + +static void fib6_flush_trees(struct net *net) +{ + __u32 new_sernum = fib6_new_sernum(); + + fib6_clean_all(net, fib6_update_sernum, &new_sernum); +} + /* * Garbage collection */ @@ -1788,6 +1806,8 @@ int __init fib6_init(void) NULL); if (ret) goto out_unregister_subsys; + + __fib6_flush_trees = fib6_flush_trees; out: return ret; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 5f19dfbc4c6a..f304471477dc 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -314,6 +314,8 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); t = ip6gre_tunnel_find(net, parms, ARPHRD_IP6GRE); + if (t && create) + return NULL; if (t || !create) return t; @@ -1724,4 +1726,5 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); MODULE_ALIAS_RTNL_LINK("ip6gre"); +MODULE_ALIAS_RTNL_LINK("ip6gretap"); MODULE_ALIAS_NETDEV("ip6gre0"); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 315a55d66079..0a3448b2888f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1009,7 +1009,7 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, if (final_dst) fl6->daddr = *final_dst; - return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); + return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); @@ -1041,7 +1041,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, if (final_dst) fl6->daddr = *final_dst; - return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); + return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index f9de5a695072..69a84b464009 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -364,8 +364,12 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net, (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr)) + ipv6_addr_equal(remote, &t->parms.raddr)) { + if (create) + return NULL; + return t; + } } if (!create) return NULL; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 7f52fd9fa7b0..5833a2244467 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -253,8 +253,12 @@ static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p, (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr)) + ipv6_addr_equal(remote, &t->parms.raddr)) { + if (create) + return NULL; + return t; + } } if (!create) return NULL; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 617f0958e164..a23b655a7627 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -172,6 +172,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) mc_lst->next = NULL; mc_lst->addr = *addr; + rtnl_lock(); rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; @@ -185,6 +186,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (dev == NULL) { rcu_read_unlock(); + rtnl_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } @@ -202,6 +204,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (err) { rcu_read_unlock(); + rtnl_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; } @@ -212,6 +215,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) spin_unlock(&ipv6_sk_mc_lock); rcu_read_unlock(); + rtnl_unlock(); return 0; } @@ -229,6 +233,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) if (!ipv6_addr_is_multicast(addr)) return -EINVAL; + rtnl_lock(); spin_lock(&ipv6_sk_mc_lock); for (lnk = &np->ipv6_mc_list; (mc_lst = rcu_dereference_protected(*lnk, @@ -252,12 +257,15 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) } else (void) ip6_mc_leave_src(sk, mc_lst, NULL); rcu_read_unlock(); + rtnl_unlock(); + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); return 0; } } spin_unlock(&ipv6_sk_mc_lock); + rtnl_unlock(); return -EADDRNOTAVAIL; } @@ -302,6 +310,7 @@ void ipv6_sock_mc_close(struct sock *sk) if (!rcu_access_pointer(np->ipv6_mc_list)) return; + rtnl_lock(); spin_lock(&ipv6_sk_mc_lock); while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list, lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) { @@ -328,6 +337,7 @@ void ipv6_sock_mc_close(struct sock *sk) spin_lock(&ipv6_sk_mc_lock); } spin_unlock(&ipv6_sk_mc_lock); + rtnl_unlock(); } int ip6_mc_source(int add, int omode, struct sock *sk, @@ -845,6 +855,8 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) struct ifmcaddr6 *mc; struct inet6_dev *idev; + ASSERT_RTNL(); + /* we need to take a reference on idev */ idev = in6_dev_get(dev); @@ -916,6 +928,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifmcaddr6 *ma, **map; + ASSERT_RTNL(); + write_lock_bh(&idev->lock); for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) { if (ipv6_addr_equal(&ma->mca_addr, addr)) { diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index ac93df16f5af..2812816aabdc 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -57,9 +57,19 @@ config NFT_REJECT_IPV6 config NF_LOG_IPV6 tristate "IPv6 packet logging" - depends on NETFILTER_ADVANCED + default m if NETFILTER_ADVANCED=n select NF_LOG_COMMON +config NF_NAT_IPV6 + tristate "IPv6 NAT" + depends on NF_CONNTRACK_IPV6 + depends on NETFILTER_ADVANCED + select NF_NAT + help + The IPv6 NAT option allows masquerading, port forwarding and other + forms of full Network Address Port Translation. This can be + controlled by iptables or nft. + config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" depends on INET && IPV6 @@ -232,19 +242,21 @@ config IP6_NF_SECURITY If unsure, say N. -config NF_NAT_IPV6 - tristate "IPv6 NAT" +config IP6_NF_NAT + tristate "ip6tables NAT support" depends on NF_CONNTRACK_IPV6 depends on NETFILTER_ADVANCED select NF_NAT + select NF_NAT_IPV6 + select NETFILTER_XT_NAT help - The IPv6 NAT option allows masquerading, port forwarding and other - forms of full Network Address Port Translation. It is controlled by - the `nat' table in ip6tables, see the man page for ip6tables(8). + This enables the `nat' table in ip6tables. This allows masquerading, + port forwarding and other forms of full Network Address Port + Translation. To compile it as a module, choose M here. If unsure, say N. -if NF_NAT_IPV6 +if IP6_NF_NAT config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" @@ -265,7 +277,7 @@ config IP6_NF_TARGET_NPT To compile it as a module, choose M here. If unsure, say N. -endif # NF_NAT_IPV6 +endif # IP6_NF_NAT endif # IP6_NF_IPTABLES diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index c0b263104ed2..c3d3286db4bb 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o -obj-$(CONFIG_NF_NAT_IPV6) += ip6table_nat.o +obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o # objects for l3 independent conntrack nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f23fbd28a501..bafde82324c5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -314,7 +314,6 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); - rt->rt6i_genid = rt_genid_ipv6(net); INIT_LIST_HEAD(&rt->rt6i_siblings); } return rt; @@ -1098,9 +1097,6 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ - if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev))) - return NULL; - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) return NULL; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 2e9ba035fb5f..6163f851dc01 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -101,19 +101,19 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || dev->ifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { if (remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || dev->ifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { if (local == t->parms.iph.saddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || dev->ifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 22055b098428..03a5d1ed3340 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -59,7 +59,6 @@ #include <net/snmp.h> #include <net/dsfield.h> #include <net/timewait_sock.h> -#include <net/netdma.h> #include <net/inet_common.h> #include <net/secure_seq.h> #include <net/tcp_memcontrol.h> @@ -667,7 +666,8 @@ clear_hash_noput: return 1; } -static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +static int __tcp_v6_inbound_md5_hash(struct sock *sk, + const struct sk_buff *skb) { const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; @@ -707,6 +707,18 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) } return 0; } + +static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __tcp_v6_inbound_md5_hash(sk, skb); + rcu_read_unlock(); + + return ret; +} + #endif static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, @@ -1247,11 +1259,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); -#ifdef CONFIG_TCP_MD5SIG - if (tcp_v6_inbound_md5_hash(sk, skb)) - goto discard; -#endif - if (sk_filter(sk, skb)) goto discard; @@ -1424,6 +1431,11 @@ process: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; +#ifdef CONFIG_TCP_MD5SIG + if (tcp_v6_inbound_md5_hash(sk, skb)) + goto discard_and_relse; +#endif + if (sk_filter(sk, skb)) goto discard_and_relse; @@ -1433,18 +1445,8 @@ process: bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v6_do_rcv(sk, skb); - else -#endif - { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); - } } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); @@ -1582,6 +1584,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, #endif + .mtu_reduced = tcp_v6_mtu_reduced, }; #ifdef CONFIG_TCP_MD5SIG @@ -1612,6 +1615,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, #endif + .mtu_reduced = tcp_v4_mtu_reduced, }; #ifdef CONFIG_TCP_MD5SIG @@ -1851,7 +1855,6 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .mtu_reduced = tcp_v6_mtu_reduced, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c index 9ea0c933b9ff..a37998c6273d 100644 --- a/net/irda/irlap_frame.c +++ b/net/irda/irlap_frame.c @@ -622,7 +622,7 @@ void irlap_send_rd_frame(struct irlap_cb *self) frame = (struct rd_frame *)skb_put(tx_skb, 2); frame->caddr = self->caddr; - frame->caddr = RD_RSP | PF_BIT; + frame->control = RD_RSP | PF_BIT; irlap_queue_xmit(self, tx_skb); } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 13752d96275e..b704a9356208 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -755,7 +755,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, /* If PMTU discovery was enabled, use the MTU that was discovered */ dst = sk_dst_get(tunnel->sock); if (dst != NULL) { - u32 pmtu = dst_mtu(__sk_dst_get(tunnel->sock)); + u32 pmtu = dst_mtu(dst); + if (pmtu != 0) session->mtu = session->mru = pmtu - PPPOL2TP_HEADER_OVERHEAD; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 0080d2b0a8ae..bb9cbc17d926 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -839,7 +839,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, if (!(flags & MSG_PEEK)) { spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); *seq = 0; } @@ -861,10 +861,10 @@ copy_uaddr: llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { - spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb, false); - spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); - *seq = 0; + spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); + sk_eat_skb(sk, skb); + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); + *seq = 0; } goto out; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 6d537f03c0ba..399ad82c997f 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -541,6 +541,8 @@ static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, continue; if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf) continue; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + continue; if (!compat) compat = &sdata->vif.bss_conf.chandef; @@ -1444,7 +1446,7 @@ ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) list_del(&sdata->reserved_chanctx_list); list_move(&sdata->assigned_chanctx_list, - &new_ctx->assigned_vifs); + &ctx->assigned_vifs); sdata->reserved_chanctx = NULL; ieee80211_vif_chanctx_reservation_complete(sdata); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 3db96648b45a..86173c0de40e 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -167,7 +167,7 @@ static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, p += scnprintf(p, sizeof(buf) + buf - p, "next dialog_token: %#02x\n", sta->ampdu_mlme.dialog_token_allocator + 1); p += scnprintf(p, sizeof(buf) + buf - p, - "TID\t\tRX active\tDTKN\tSSN\t\tTX\tDTKN\tpending\n"); + "TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n"); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 01eede7406a5..f75e5f132c5a 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1175,8 +1175,8 @@ static void ieee80211_iface_work(struct work_struct *work) if (sta) { u16 last_seq; - last_seq = le16_to_cpu( - sta->last_seq_ctrl[rx_agg->tid]); + last_seq = IEEE80211_SEQ_TO_SN(le16_to_cpu( + sta->last_seq_ctrl[rx_agg->tid])); __ieee80211_start_rx_ba_session(sta, 0, 0, diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 63b874101b27..c47194d27149 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -959,7 +959,8 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, if (!matches_local) event = CNF_RJCT; if (!mesh_plink_free_count(sdata) || - (sta->llid != llid || sta->plid != plid)) + sta->llid != llid || + (sta->plid && sta->plid != plid)) event = CNF_IGNR; else event = CNF_ACPT; @@ -1080,6 +1081,10 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, goto unlock_rcu; } + /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */ + if (!sta->plid && event == CNF_ACPT) + sta->plid = plid; + changed |= mesh_plink_fsm(sdata, sta, event); unlock_rcu: diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 31a8afaf7332..b82a12a9f0f1 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4376,8 +4376,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); if (bss->wmm_used && bss->uapsd_supported && - (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) && - sdata->wmm_acm != 0xff) { + (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD)) { assoc_data->uapsd = true; ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED; } else { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index c6ee2139fbc5..a1e433b88c66 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1094,8 +1094,11 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) unsigned long flags; struct ps_data *ps; - if (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, + u.ap); + + if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; @@ -1819,7 +1822,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; if (sdata->vif.bss_conf.use_short_slot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; - sinfo->bss_param.dtim_period = sdata->local->hw.conf.ps_dtim_period; + sinfo->bss_param.dtim_period = sdata->vif.bss_conf.dtim_period; sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int; sinfo->sta_flags.set = 0; diff --git a/net/mac802154/wpan.c b/net/mac802154/wpan.c index aa25269d6902..d593500ceb3c 100644 --- a/net/mac802154/wpan.c +++ b/net/mac802154/wpan.c @@ -462,7 +462,10 @@ mac802154_subif_frame(struct mac802154_sub_if_data *sdata, struct sk_buff *skb, skb->pkt_type = PACKET_OTHERHOST; break; default: - break; + spin_unlock_bh(&sdata->mib_lock); + pr_debug("invalid dest mode\n"); + kfree_skb(skb); + return NET_RX_DROP; } spin_unlock_bh(&sdata->mib_lock); @@ -573,6 +576,7 @@ void mac802154_wpans_rx(struct mac802154_priv *priv, struct sk_buff *skb) ret = mac802154_parse_frame_start(skb, &hdr); if (ret) { pr_debug("got invalid frame\n"); + kfree_skb(skb); return; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ad751fe2e82b..6d77cce481d5 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -499,7 +499,7 @@ config NFT_LIMIT config NFT_NAT depends on NF_TABLES depends on NF_CONNTRACK - depends on NF_NAT + select NF_NAT tristate "Netfilter nf_tables nat module" help This option adds the "nat" expression that you can use to perform @@ -747,7 +747,9 @@ config NETFILTER_XT_TARGET_LED config NETFILTER_XT_TARGET_LOG tristate "LOG target support" - depends on NF_LOG_IPV4 && NF_LOG_IPV6 + select NF_LOG_COMMON + select NF_LOG_IPV4 + select NF_LOG_IPV6 if IPV6 default m if NETFILTER_ADVANCED=n help This option adds a `LOG' target, which allows you to create rules in @@ -764,6 +766,14 @@ config NETFILTER_XT_TARGET_MARK (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). +config NETFILTER_XT_NAT + tristate '"SNAT and DNAT" targets support' + depends on NF_NAT + ---help--- + This option enables the SNAT and DNAT targets. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_NETMAP tristate '"NETMAP" target support' depends on NF_NAT @@ -837,6 +847,7 @@ config NETFILTER_XT_TARGET_TPROXY tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED + depends on (IPV6 || IPV6=n) depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 8308624a406a..fad5fdba34e5 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -95,7 +95,7 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o -obj-$(CONFIG_NF_NAT) += xt_nat.o +obj-$(CONFIG_NETFILTER_XT_NAT) += xt_nat.o # targets obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 1fbab0cdd302..024a2e25c8a4 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -35,11 +35,7 @@ EXPORT_SYMBOL_GPL(nf_ipv6_ops); int nf_register_afinfo(const struct nf_afinfo *afinfo) { - int err; - - err = mutex_lock_interruptible(&afinfo_mutex); - if (err < 0) - return err; + mutex_lock(&afinfo_mutex); RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo); mutex_unlock(&afinfo_mutex); return 0; @@ -58,7 +54,7 @@ EXPORT_SYMBOL_GPL(nf_unregister_afinfo); struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); -#if defined(CONFIG_JUMP_LABEL) +#ifdef HAVE_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif @@ -68,18 +64,15 @@ static DEFINE_MUTEX(nf_hook_mutex); int nf_register_hook(struct nf_hook_ops *reg) { struct nf_hook_ops *elem; - int err; - err = mutex_lock_interruptible(&nf_hook_mutex); - if (err < 0) - return err; + mutex_lock(&nf_hook_mutex); list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { if (reg->priority < elem->priority) break; } list_add_rcu(®->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); -#if defined(CONFIG_JUMP_LABEL) +#ifdef HAVE_JUMP_LABEL static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0; @@ -91,7 +84,7 @@ void nf_unregister_hook(struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); -#if defined(CONFIG_JUMP_LABEL) +#ifdef HAVE_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index e6836755c45d..5c34e8d42e01 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1906,7 +1906,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_local_reply6, .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, }, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 8416307fdd1d..fd3f444a4f96 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2271,10 +2271,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) cmd == IP_VS_SO_SET_STOPDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - if (mutex_lock_interruptible(&ipvs->sync_mutex)) { - ret = -ERESTARTSYS; - goto out_dec; - } + mutex_lock(&ipvs->sync_mutex); if (cmd == IP_VS_SO_SET_STARTDAEMON) ret = start_sync_thread(net, dm->state, dm->mcast_ifn, dm->syncid); @@ -2284,11 +2281,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) goto out_dec; } - if (mutex_lock_interruptible(&__ip_vs_mutex)) { - ret = -ERESTARTSYS; - goto out_dec; - } - + mutex_lock(&__ip_vs_mutex); if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ ret = ip_vs_flush(net, false); @@ -2573,9 +2566,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) struct ip_vs_daemon_user d[2]; memset(&d, 0, sizeof(d)); - if (mutex_lock_interruptible(&ipvs->sync_mutex)) - return -ERESTARTSYS; - + mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, @@ -2594,9 +2585,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } - if (mutex_lock_interruptible(&__ip_vs_mutex)) - return -ERESTARTSYS; - + mutex_lock(&__ip_vs_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: { diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 6f70bdd3a90a..56896a412bce 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -38,6 +38,7 @@ #include <net/route.h> /* for ip_route_output */ #include <net/ipv6.h> #include <net/ip6_route.h> +#include <net/ip_tunnels.h> #include <net/addrconf.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> @@ -862,11 +863,15 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, old_iph = ip_hdr(skb); } - skb->transport_header = skb->network_header; - /* fix old IP header checksum */ ip_send_check(old_iph); + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + if (IS_ERR(skb)) + goto tx_error; + + skb->transport_header = skb->network_header; + skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -900,7 +905,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, return NF_STOLEN; tx_error: - kfree_skb(skb); + if (!IS_ERR(skb)) + kfree_skb(skb); rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; @@ -953,6 +959,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, old_iph = ipv6_hdr(skb); } + /* GSO: we need to provide proper SKB_GSO_ value for IPv6 */ + skb = iptunnel_handle_offloads(skb, false, 0); /* SKB_GSO_SIT/IPV6 */ + if (IS_ERR(skb)) + goto tx_error; + skb->transport_header = skb->network_header; skb_push(skb, sizeof(struct ipv6hdr)); @@ -988,7 +999,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, return NF_STOLEN; tx_error: - kfree_skb(skb); + if (!IS_ERR(skb)) + kfree_skb(skb); rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index f042ae521557..c68c1e58b362 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -26,9 +26,7 @@ int nf_register_sockopt(struct nf_sockopt_ops *reg) struct nf_sockopt_ops *ops; int ret = 0; - if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) - return -EINTR; - + mutex_lock(&nf_sockopt_mutex); list_for_each_entry(ops, &nf_sockopts, list) { if (ops->pf == reg->pf && (overlap(ops->set_optmin, ops->set_optmax, @@ -65,9 +63,7 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf, { struct nf_sockopt_ops *ops; - if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) - return ERR_PTR(-EINTR); - + mutex_lock(&nf_sockopt_mutex); list_for_each_entry(ops, &nf_sockopts, list) { if (ops->pf == pf) { if (!try_module_get(ops->owner)) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b8035c2d6667..deeb95fb7028 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -899,6 +899,9 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) static void nft_chain_stats_replace(struct nft_base_chain *chain, struct nft_stats __percpu *newstats) { + if (newstats == NULL) + return; + if (chain->stats) { struct nft_stats __percpu *oldstats = nft_dereference(chain->stats); @@ -3134,16 +3137,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, goto err2; trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); - if (trans == NULL) + if (trans == NULL) { + err = -ENOMEM; goto err2; + } nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); - - nft_data_uninit(&elem.key, NFT_DATA_VALUE); - if (set->flags & NFT_SET_MAP) - nft_data_uninit(&elem.data, set->dtype); - return 0; err2: nft_data_uninit(&elem.key, desc.type); @@ -3310,7 +3310,7 @@ static int nf_tables_commit(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); struct nft_trans *trans, *next; - struct nft_set *set; + struct nft_trans_elem *te; /* Bump generation counter, invalidate any dump in progress */ while (++net->nft.base_seq == 0); @@ -3396,13 +3396,17 @@ static int nf_tables_commit(struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: - nf_tables_setelem_notify(&trans->ctx, - nft_trans_elem_set(trans), - &nft_trans_elem(trans), + te = (struct nft_trans_elem *)trans->data; + nf_tables_setelem_notify(&trans->ctx, te->set, + &te->elem, NFT_MSG_DELSETELEM, 0); - set = nft_trans_elem_set(trans); - set->ops->get(set, &nft_trans_elem(trans)); - set->ops->remove(set, &nft_trans_elem(trans)); + te->set->ops->get(te->set, &te->elem); + te->set->ops->remove(te->set, &te->elem); + nft_data_uninit(&te->elem.key, NFT_DATA_VALUE); + if (te->elem.flags & NFT_SET_MAP) { + nft_data_uninit(&te->elem.data, + te->set->dtype); + } nft_trans_destroy(trans); break; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index c138b8fbe280..f37f0716a9fc 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -222,6 +222,51 @@ replay: } } +struct nfnl_err { + struct list_head head; + struct nlmsghdr *nlh; + int err; +}; + +static int nfnl_err_add(struct list_head *list, struct nlmsghdr *nlh, int err) +{ + struct nfnl_err *nfnl_err; + + nfnl_err = kmalloc(sizeof(struct nfnl_err), GFP_KERNEL); + if (nfnl_err == NULL) + return -ENOMEM; + + nfnl_err->nlh = nlh; + nfnl_err->err = err; + list_add_tail(&nfnl_err->head, list); + + return 0; +} + +static void nfnl_err_del(struct nfnl_err *nfnl_err) +{ + list_del(&nfnl_err->head); + kfree(nfnl_err); +} + +static void nfnl_err_reset(struct list_head *err_list) +{ + struct nfnl_err *nfnl_err, *next; + + list_for_each_entry_safe(nfnl_err, next, err_list, head) + nfnl_err_del(nfnl_err); +} + +static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb) +{ + struct nfnl_err *nfnl_err, *next; + + list_for_each_entry_safe(nfnl_err, next, err_list, head) { + netlink_ack(skb, nfnl_err->nlh, nfnl_err->err); + nfnl_err_del(nfnl_err); + } +} + static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, u_int16_t subsys_id) { @@ -230,6 +275,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; bool success = true, done = false; + static LIST_HEAD(err_list); int err; if (subsys_id >= NFNL_SUBSYS_COUNT) @@ -287,6 +333,7 @@ replay: type = nlh->nlmsg_type; if (type == NFNL_MSG_BATCH_BEGIN) { /* Malformed: Batch begin twice */ + nfnl_err_reset(&err_list); success = false; goto done; } else if (type == NFNL_MSG_BATCH_END) { @@ -333,6 +380,7 @@ replay: * original skb. */ if (err == -EAGAIN) { + nfnl_err_reset(&err_list); ss->abort(skb); nfnl_unlock(subsys_id); kfree_skb(nskb); @@ -341,11 +389,24 @@ replay: } ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) { + /* Errors are delivered once the full batch has been + * processed, this avoids that the same error is + * reported several times when replaying the batch. + */ + if (nfnl_err_add(&err_list, nlh, err) < 0) { + /* We failed to enqueue an error, reset the + * list of errors and send OOM to userspace + * pointing to the batch header. + */ + nfnl_err_reset(&err_list); + netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM); + success = false; + goto done; + } /* We don't stop processing the batch on errors, thus, * userspace gets all the errors that the batch * triggers. */ - netlink_ack(skb, nlh, err); if (err) success = false; } @@ -361,6 +422,7 @@ done: else ss->abort(skb); + nfnl_err_deliver(&err_list, oskb); nfnl_unlock(subsys_id); kfree_skb(nskb); } diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 28fb8f38e6ba..8892b7b6184a 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -180,15 +180,17 @@ static int nft_hash_init(const struct nft_set *set, static void nft_hash_destroy(const struct nft_set *set) { const struct rhashtable *priv = nft_set_priv(set); - const struct bucket_table *tbl; + const struct bucket_table *tbl = priv->tbl; struct nft_hash_elem *he, *next; unsigned int i; - tbl = rht_dereference(priv->tbl, priv); - for (i = 0; i < tbl->size; i++) - rht_for_each_entry_safe(he, next, tbl->buckets[i], priv, node) + for (i = 0; i < tbl->size; i++) { + for (he = rht_entry(tbl->buckets[i], struct nft_hash_elem, node); + he != NULL; he = next) { + next = rht_entry(he->node.next, struct nft_hash_elem, node); nft_hash_elem_destroy(set, he); - + } + } rhashtable_destroy(priv); } diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index e1836ff88199..46214f245665 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -234,13 +234,11 @@ static void nft_rbtree_destroy(const struct nft_set *set) struct nft_rbtree_elem *rbe; struct rb_node *node; - spin_lock_bh(&nft_rbtree_lock); while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); nft_rbtree_elem_destroy(set, rbe); } - spin_unlock_bh(&nft_rbtree_lock); } static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 47b978bc3100..272ae4d6fdf4 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -71,18 +71,14 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = { static const unsigned int xt_jumpstack_multiplier = 2; /* Registration hooks for targets. */ -int -xt_register_target(struct xt_target *target) +int xt_register_target(struct xt_target *target) { u_int8_t af = target->family; - int ret; - ret = mutex_lock_interruptible(&xt[af].mutex); - if (ret != 0) - return ret; + mutex_lock(&xt[af].mutex); list_add(&target->list, &xt[af].target); mutex_unlock(&xt[af].mutex); - return ret; + return 0; } EXPORT_SYMBOL(xt_register_target); @@ -125,20 +121,14 @@ xt_unregister_targets(struct xt_target *target, unsigned int n) } EXPORT_SYMBOL(xt_unregister_targets); -int -xt_register_match(struct xt_match *match) +int xt_register_match(struct xt_match *match) { u_int8_t af = match->family; - int ret; - - ret = mutex_lock_interruptible(&xt[af].mutex); - if (ret != 0) - return ret; + mutex_lock(&xt[af].mutex); list_add(&match->list, &xt[af].match); mutex_unlock(&xt[af].mutex); - - return ret; + return 0; } EXPORT_SYMBOL(xt_register_match); @@ -194,9 +184,7 @@ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) struct xt_match *m; int err = -ENOENT; - if (mutex_lock_interruptible(&xt[af].mutex) != 0) - return ERR_PTR(-EINTR); - + mutex_lock(&xt[af].mutex); list_for_each_entry(m, &xt[af].match, list) { if (strcmp(m->name, name) == 0) { if (m->revision == revision) { @@ -239,9 +227,7 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) struct xt_target *t; int err = -ENOENT; - if (mutex_lock_interruptible(&xt[af].mutex) != 0) - return ERR_PTR(-EINTR); - + mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt[af].target, list) { if (strcmp(t->name, name) == 0) { if (t->revision == revision) { @@ -323,10 +309,7 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target, { int have_rev, best = -1; - if (mutex_lock_interruptible(&xt[af].mutex) != 0) { - *err = -EINTR; - return 1; - } + mutex_lock(&xt[af].mutex); if (target == 1) have_rev = target_revfn(af, name, revision, &best); else @@ -732,9 +715,7 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, { struct xt_table *t; - if (mutex_lock_interruptible(&xt[af].mutex) != 0) - return ERR_PTR(-EINTR); - + mutex_lock(&xt[af].mutex); list_for_each_entry(t, &net->xt.tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; @@ -883,10 +864,7 @@ struct xt_table *xt_register_table(struct net *net, goto out; } - ret = mutex_lock_interruptible(&xt[table->af].mutex); - if (ret != 0) - goto out_free; - + mutex_lock(&xt[table->af].mutex); /* Don't autoload: we'd eat our tail... */ list_for_each_entry(t, &net->xt.tables[table->af], list) { if (strcmp(t->name, table->name) == 0) { @@ -911,9 +889,8 @@ struct xt_table *xt_register_table(struct net *net, mutex_unlock(&xt[table->af].mutex); return table; - unlock: +unlock: mutex_unlock(&xt[table->af].mutex); -out_free: kfree(table); out: return ERR_PTR(ret); diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index f4e833005320..7198d660b4de 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -31,7 +31,7 @@ static int cgroup_mt_check(const struct xt_mtchk_param *par) if (info->invert & ~1) return -EINVAL; - return info->id ? 0 : -EINVAL; + return 0; } static bool diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 479a344563d8..c416725d28c4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -104,6 +104,7 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); /* Protects netlink socket hash table mutations */ DEFINE_MUTEX(nl_sk_hash_lock); +EXPORT_SYMBOL_GPL(nl_sk_hash_lock); static int lockdep_nl_sk_hash_is_held(void) { @@ -212,7 +213,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, nskb->protocol = htons((u16) sk->sk_protocol); nskb->pkt_type = netlink_is_kernel(sk) ? PACKET_KERNEL : PACKET_USER; - + skb_reset_network_header(nskb); ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); @@ -2920,6 +2921,7 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) } static void *netlink_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { rcu_read_lock(); return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN; @@ -2969,6 +2971,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void netlink_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { rcu_read_unlock(); } diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 60f631fb7087..b20a1731759b 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -73,5 +73,6 @@ struct netlink_table { extern struct netlink_table *nl_table; extern rwlock_t nl_table_lock; +extern struct mutex nl_sk_hash_lock; #endif diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 7301850eb56f..de8c74a3c061 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -170,6 +170,7 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) req = nlmsg_data(cb->nlh); + mutex_lock(&nl_sk_hash_lock); read_lock(&nl_table_lock); if (req->sdiag_protocol == NDIAG_PROTO_ALL) { @@ -183,6 +184,7 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) } else { if (req->sdiag_protocol >= MAX_LINKS) { read_unlock(&nl_table_lock); + mutex_unlock(&nl_sk_hash_lock); return -ENOENT; } @@ -190,6 +192,7 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) } read_unlock(&nl_table_lock); + mutex_unlock(&nl_sk_hash_lock); return skb->len; } diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index fe5cda0deb39..5231652a95d9 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -42,6 +42,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, static int make_writable(struct sk_buff *skb, int write_len) { + if (!pskb_may_pull(skb, write_len)) + return -ENOMEM; + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) return 0; @@ -70,6 +73,8 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) vlan_set_encap_proto(skb, vhdr); skb->mac_header += VLAN_HLEN; + if (skb_network_offset(skb) < ETH_HLEN) + skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_len(skb); return 0; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 7ad3f029baae..64dc864a417f 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -47,8 +47,6 @@ #include <linux/openvswitch.h> #include <linux/rculist.h> #include <linux/dmi.h> -#include <linux/genetlink.h> -#include <net/genetlink.h> #include <net/genetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -80,11 +78,12 @@ static const struct genl_multicast_group ovs_dp_vport_multicast_group = { /* Check if need to build a reply message. * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */ -static bool ovs_must_notify(struct genl_info *info, - const struct genl_multicast_group *grp) +static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, + unsigned int group) { return info->nlhdr->nlmsg_flags & NLM_F_ECHO || - netlink_has_listeners(genl_info_net(info)->genl_sock, 0); + genl_has_listeners(family, genl_info_net(info)->genl_sock, + group); } static void ovs_notify(struct genl_family *family, @@ -267,8 +266,11 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) upcall.key = &key; upcall.userdata = NULL; upcall.portid = ovs_vport_find_upcall_portid(p, skb); - ovs_dp_upcall(dp, skb, &upcall); - consume_skb(skb); + error = ovs_dp_upcall(dp, skb, &upcall); + if (unlikely(error)) + kfree_skb(skb); + else + consume_skb(skb); stats_counter = &stats->n_missed; goto out; } @@ -406,7 +408,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, { struct ovs_header *upcall; struct sk_buff *nskb = NULL; - struct sk_buff *user_skb; /* to be queued to userspace */ + struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; struct genl_info info = { .dst_sk = ovs_dp_get_net(dp)->genl_sock, @@ -496,9 +498,11 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); + user_skb = NULL; out: if (err) skb_tx_error(skb); + kfree_skb(user_skb); kfree_skb(nskb); return err; } @@ -760,7 +764,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act { struct sk_buff *skb; - if (!always && !ovs_must_notify(info, &ovs_dp_flow_multicast_group)) + if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0)) return NULL; skb = genlmsg_new_unicast(ovs_flow_cmd_msg_size(acts), info, GFP_KERNEL); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 702fb21bfe15..6d8f2ec481d9 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -137,8 +137,10 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); - if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) + if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) { + kfree(vport); return ERR_PTR(-EINVAL); + } vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vport->percpu_stats) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8d9f8042705a..93896d2092f6 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -632,6 +632,7 @@ static void init_prb_bdqc(struct packet_sock *po, p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; + p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); prb_setup_retire_blk_timer(po, tx_ring); prb_open_block(p1, pbd); @@ -1942,6 +1943,18 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if ((int)snaplen < 0) snaplen = 0; } + } else if (unlikely(macoff + snaplen > + GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { + u32 nval; + + nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; + pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n", + snaplen, nval, macoff); + snaplen = nval; + if (unlikely((int)snaplen < 0)) { + snaplen = 0; + macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; + } } spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_rx_frame(po, skb, @@ -3783,6 +3796,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) goto out; + if (po->tp_version >= TPACKET_V3 && + (int)(req->tp_block_size - + BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0) + goto out; if (unlikely(req->tp_frame_size < po->tp_hdrlen + po->tp_reserve)) goto out; diff --git a/net/packet/internal.h b/net/packet/internal.h index eb9580a6b25f..cdddf6a30399 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -29,6 +29,7 @@ struct tpacket_kbdq_core { char *pkblk_start; char *pkblk_end; int kblk_size; + unsigned int max_frame_len; unsigned int knum_blocks; uint64_t knxt_seq_num; char *prev; diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 14c98e48f261..0f62326c0f5e 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -54,7 +54,7 @@ static int rfkill_gpio_set_power(void *data, bool blocked) if (blocked && !IS_ERR(rfkill->clk) && rfkill->clk_enabled) clk_disable(rfkill->clk); - rfkill->clk_enabled = blocked; + rfkill->clk_enabled = !blocked; return 0; } @@ -158,10 +158,12 @@ static const struct acpi_device_id rfkill_acpi_match[] = { { "BCM2E1A", RFKILL_TYPE_BLUETOOTH }, { "BCM2E39", RFKILL_TYPE_BLUETOOTH }, { "BCM2E3D", RFKILL_TYPE_BLUETOOTH }, + { "BCM2E64", RFKILL_TYPE_BLUETOOTH }, { "BCM4752", RFKILL_TYPE_GPS }, { "LNV4752", RFKILL_TYPE_GPS }, { }, }; +MODULE_DEVICE_TABLE(acpi, rfkill_acpi_match); #endif static struct platform_driver rfkill_gpio_driver = { diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index b45d080e64a7..1b24191167f1 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -1143,7 +1143,7 @@ static long rxrpc_read(const struct key *key, if (copy_to_user(xdr, (s), _l) != 0) \ goto fault; \ if (_l & 3 && \ - copy_to_user((u8 *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \ + copy_to_user((u8 __user *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \ goto fault; \ xdr += (_l + 3) >> 2; \ } while(0) diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 3a633debb6df..ad57f4444b9c 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -526,9 +526,11 @@ pop_stack: match_idx = stack[--stackp]; cur_match = tcf_em_get_match(tree, match_idx); - if (tcf_em_early_end(cur_match, res)) + if (tcf_em_early_end(cur_match, res)) { + if (tcf_em_is_inverted(cur_match)) + res = !res; goto pop_stack; - else { + } else { match_idx++; goto proceed; } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index ead526467cca..762a04bb8f6d 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -159,7 +159,6 @@ struct cbq_sched_data { struct cbq_class *tx_borrowed; int tx_len; psched_time_t now; /* Cached timestamp */ - psched_time_t now_rt; /* Cached real time */ unsigned int pmask; struct hrtimer delay_timer; @@ -353,12 +352,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) int toplevel = q->toplevel; if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) { - psched_time_t now; - psched_tdiff_t incr; - - now = psched_get_time(); - incr = now - q->now_rt; - now = q->now + incr; + psched_time_t now = psched_get_time(); do { if (cl->undertime < now) { @@ -700,8 +694,13 @@ cbq_update(struct cbq_sched_data *q) struct cbq_class *this = q->tx_class; struct cbq_class *cl = this; int len = q->tx_len; + psched_time_t now; q->tx_class = NULL; + /* Time integrator. We calculate EOS time + * by adding expected packet transmission time. + */ + now = q->now + L2T(&q->link, len); for ( ; cl; cl = cl->share) { long avgidle = cl->avgidle; @@ -717,7 +716,7 @@ cbq_update(struct cbq_sched_data *q) * idle = (now - last) - last_pktlen/rate */ - idle = q->now - cl->last; + idle = now - cl->last; if ((unsigned long)idle > 128*1024*1024) { avgidle = cl->maxidle; } else { @@ -761,7 +760,7 @@ cbq_update(struct cbq_sched_data *q) idle -= L2T(&q->link, len); idle += L2T(cl, len); - cl->undertime = q->now + idle; + cl->undertime = now + idle; } else { /* Underlimit */ @@ -771,7 +770,8 @@ cbq_update(struct cbq_sched_data *q) else cl->avgidle = avgidle; } - cl->last = q->now; + if ((s64)(now - cl->last) > 0) + cl->last = now; } cbq_update_toplevel(q, this, q->tx_borrowed); @@ -943,31 +943,13 @@ cbq_dequeue(struct Qdisc *sch) struct sk_buff *skb; struct cbq_sched_data *q = qdisc_priv(sch); psched_time_t now; - psched_tdiff_t incr; now = psched_get_time(); - incr = now - q->now_rt; - - if (q->tx_class) { - psched_tdiff_t incr2; - /* Time integrator. We calculate EOS time - * by adding expected packet transmission time. - * If real time is greater, we warp artificial clock, - * so that: - * - * cbq_time = max(real_time, work); - */ - incr2 = L2T(&q->link, q->tx_len); - q->now += incr2; + + if (q->tx_class) cbq_update(q); - if ((incr -= incr2) < 0) - incr = 0; - q->now += incr; - } else { - if (now > q->now) - q->now = now; - } - q->now_rt = now; + + q->now = now; for (;;) { q->wd_expires = 0; @@ -1223,7 +1205,6 @@ cbq_reset(struct Qdisc *sch) hrtimer_cancel(&q->delay_timer); q->toplevel = TC_CBQ_MAXLEVEL; q->now = psched_get_time(); - q->now_rt = q->now; for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) q->active[prio] = NULL; @@ -1407,7 +1388,6 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) q->delay_timer.function = cbq_undelay; q->toplevel = TC_CBQ_MAXLEVEL; q->now = psched_get_time(); - q->now_rt = q->now; cbq_link_class(&q->link); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ed30e436128b..fb666d1e4de3 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -133,10 +133,16 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) --sch->q.qlen; } +/* private part of skb->cb[] that a qdisc is allowed to use + * is limited to QDISC_CB_PRIV_LEN bytes. + * As a flow key might be too large, we store a part of it only. + */ +#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3) + struct choke_skb_cb { u16 classid; u8 keys_valid; - struct flow_keys keys; + u8 keys[QDISC_CB_PRIV_LEN - 3]; }; static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) @@ -163,22 +169,26 @@ static u16 choke_get_classid(const struct sk_buff *skb) static bool choke_match_flow(struct sk_buff *skb1, struct sk_buff *skb2) { + struct flow_keys temp; + if (skb1->protocol != skb2->protocol) return false; if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; - skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys); + skb_flow_dissect(skb1, &temp); + memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; - skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys); + skb_flow_dissect(skb2, &temp); + memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN); } return !memcmp(&choke_skb_cb(skb1)->keys, &choke_skb_cb(skb2)->keys, - sizeof(struct flow_keys)); + CHOKE_K_LEN); } /* diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 06a9ee6b2d3a..a88b8524846e 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -813,6 +813,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, else { dst_release(transport->dst); transport->dst = NULL; + ulp_notify = false; } spc_state = SCTP_ADDR_UNREACHABLE; @@ -1244,7 +1245,7 @@ static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr, { u8 score_curr, score_best; - if (best == NULL) + if (best == NULL || curr == best) return curr; score_curr = sctp_trans_score(curr); @@ -1355,14 +1356,11 @@ static void sctp_select_active_and_retran_path(struct sctp_association *asoc) trans_sec = trans_pri; /* If we failed to find a usable transport, just camp on the - * primary or retran, even if they are inactive, if possible - * pick a PF iff it's the better choice. + * active or pick a PF iff it's the better choice. */ if (trans_pri == NULL) { - trans_pri = sctp_trans_elect_best(asoc->peer.primary_path, - asoc->peer.retran_path); - trans_pri = sctp_trans_elect_best(trans_pri, trans_pf); - trans_sec = asoc->peer.primary_path; + trans_pri = sctp_trans_elect_best(asoc->peer.active_path, trans_pf); + trans_sec = trans_pri; } /* Set the active and retran transports. */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index eb71d49e7653..634a2abb5f3a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4243,7 +4243,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, transport = asoc->peer.primary_path; status.sstat_assoc_id = sctp_assoc2id(asoc); - status.sstat_state = asoc->state; + status.sstat_state = sctp_assoc_to_state(asoc); status.sstat_rwnd = asoc->peer.rwnd; status.sstat_unackdata = asoc->unack_data; diff --git a/net/socket.c b/net/socket.c index ae89569a2db5..4cdbc107606f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -610,20 +610,26 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) +void sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) { - *tx_flags = 0; + u8 flags = *tx_flags; + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_HARDWARE) - *tx_flags |= SKBTX_HW_TSTAMP; + flags |= SKBTX_HW_TSTAMP; + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) - *tx_flags |= SKBTX_SW_TSTAMP; + flags |= SKBTX_SW_TSTAMP; + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED) - *tx_flags |= SKBTX_SCHED_TSTAMP; + flags |= SKBTX_SCHED_TSTAMP; + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK) - *tx_flags |= SKBTX_ACK_TSTAMP; + flags |= SKBTX_ACK_TSTAMP; if (sock_flag(sk, SOCK_WIFI_STATUS)) - *tx_flags |= SKBTX_WIFI_STATUS; + flags |= SKBTX_WIFI_STATUS; + + *tx_flags = flags; } EXPORT_SYMBOL(sock_tx_timestamp); @@ -728,8 +734,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, } memset(&tss, 0, sizeof(tss)); - if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE || - skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP) && + if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && @@ -1991,6 +1996,9 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, if (copy_from_user(kmsg, umsg, sizeof(struct msghdr))) return -EFAULT; + if (kmsg->msg_name == NULL) + kmsg->msg_namelen = 0; + if (kmsg->msg_namelen < 0) return -EINVAL; @@ -2596,7 +2604,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) * * This function is called by a protocol handler that wants to * advertise its address family, and have it linked into the - * socket interface. The value ops->family coresponds to the + * socket interface. The value ops->family corresponds to the * socket system call protocol family. */ int sock_register(const struct net_proto_family *ops) diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index a622ad64acd8..2e0a6f92e563 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -176,7 +176,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, len = (buf + buflen) - delim - 1; p = kstrndup(delim + 1, len, GFP_KERNEL); if (p) { - unsigned long scope_id = 0; + u32 scope_id = 0; struct net_device *dev; dev = dev_get_by_name(net, p); @@ -184,7 +184,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, scope_id = dev->ifindex; dev_put(dev); } else { - if (strict_strtoul(p, 10, &scope_id) == 0) { + if (kstrtou32(p, 10, &scope_id) == 0) { kfree(p); return 0; } @@ -304,7 +304,7 @@ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags) * @sap: buffer into which to plant socket address * @salen: size of buffer * - * @uaddr does not have to be '\0'-terminated, but strict_strtoul() and + * @uaddr does not have to be '\0'-terminated, but kstrtou8() and * rpc_pton() require proper string termination to be successful. * * Returns the size of the socket address if successful; otherwise @@ -315,7 +315,7 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr, const size_t salen) { char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')]; - unsigned long portlo, porthi; + u8 portlo, porthi; unsigned short port; if (uaddr_len > RPCBIND_MAXUADDRLEN) @@ -327,18 +327,14 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr, c = strrchr(buf, '.'); if (unlikely(c == NULL)) return 0; - if (unlikely(strict_strtoul(c + 1, 10, &portlo) != 0)) - return 0; - if (unlikely(portlo > 255)) + if (unlikely(kstrtou8(c + 1, 10, &portlo) != 0)) return 0; *c = '\0'; c = strrchr(buf, '.'); if (unlikely(c == NULL)) return 0; - if (unlikely(strict_strtoul(c + 1, 10, &porthi) != 0)) - return 0; - if (unlikely(porthi > 255)) + if (unlikely(kstrtou8(c + 1, 10, &porthi) != 0)) return 0; port = (unsigned short)((porthi << 8) | portlo); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index f77366717420..383eb919ac0b 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -48,7 +48,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) if (!val) goto out_inval; - ret = strict_strtoul(val, 0, &num); + ret = kstrtoul(val, 0, &num); if (ret == -EINVAL) goto out_inval; nbits = fls(num); @@ -80,6 +80,10 @@ static struct kernel_param_ops param_ops_hashtbl_sz = { module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644); MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size"); +static unsigned long auth_max_cred_cachesize = ULONG_MAX; +module_param(auth_max_cred_cachesize, ulong, 0644); +MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size"); + static u32 pseudoflavor_to_flavor(u32 flavor) { if (flavor > RPC_AUTH_MAXFLAVOR) @@ -363,6 +367,15 @@ rpcauth_cred_key_to_expire(struct rpc_cred *cred) } EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire); +char * +rpcauth_stringify_acceptor(struct rpc_cred *cred) +{ + if (!cred->cr_ops->crstringify_acceptor) + return NULL; + return cred->cr_ops->crstringify_acceptor(cred); +} +EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor); + /* * Destroy a list of credentials */ @@ -472,6 +485,20 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) return freed; } +static unsigned long +rpcauth_cache_do_shrink(int nr_to_scan) +{ + LIST_HEAD(free); + unsigned long freed; + + spin_lock(&rpc_credcache_lock); + freed = rpcauth_prune_expired(&free, nr_to_scan); + spin_unlock(&rpc_credcache_lock); + rpcauth_destroy_credlist(&free); + + return freed; +} + /* * Run memory cache shrinker. */ @@ -479,9 +506,6 @@ static unsigned long rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - LIST_HEAD(free); - unsigned long freed; - if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) return SHRINK_STOP; @@ -489,12 +513,7 @@ rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) if (list_empty(&cred_unused)) return SHRINK_STOP; - spin_lock(&rpc_credcache_lock); - freed = rpcauth_prune_expired(&free, sc->nr_to_scan); - spin_unlock(&rpc_credcache_lock); - rpcauth_destroy_credlist(&free); - - return freed; + return rpcauth_cache_do_shrink(sc->nr_to_scan); } static unsigned long @@ -504,6 +523,21 @@ rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) return (number_cred_unused / 100) * sysctl_vfs_cache_pressure; } +static void +rpcauth_cache_enforce_limit(void) +{ + unsigned long diff; + unsigned int nr_to_scan; + + if (number_cred_unused <= auth_max_cred_cachesize) + return; + diff = number_cred_unused - auth_max_cred_cachesize; + nr_to_scan = 100; + if (diff < nr_to_scan) + nr_to_scan = diff; + rpcauth_cache_do_shrink(nr_to_scan); +} + /* * Look up a process' credentials in the authentication cache */ @@ -523,6 +557,12 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; + if (flags & RPCAUTH_LOOKUP_RCU) { + if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) && + !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags)) + cred = entry; + break; + } spin_lock(&cache->lock); if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) { spin_unlock(&cache->lock); @@ -537,6 +577,9 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (cred != NULL) goto found; + if (flags & RPCAUTH_LOOKUP_RCU) + return ERR_PTR(-ECHILD); + new = auth->au_ops->crcreate(auth, acred, flags); if (IS_ERR(new)) { cred = new; @@ -557,6 +600,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, } else list_add_tail(&new->cr_lru, &free); spin_unlock(&cache->lock); + rpcauth_cache_enforce_limit(); found: if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && cred->cr_ops->cr_init != NULL && @@ -586,10 +630,8 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags) memset(&acred, 0, sizeof(acred)); acred.uid = cred->fsuid; acred.gid = cred->fsgid; - acred.group_info = get_group_info(((struct cred *)cred)->group_info); - + acred.group_info = cred->group_info; ret = auth->au_ops->lookup_cred(auth, &acred, flags); - put_group_info(acred.group_info); return ret; } EXPORT_SYMBOL_GPL(rpcauth_lookupcred); diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index ed04869b2d4f..6f6b829c9e8e 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -38,6 +38,12 @@ struct rpc_cred *rpc_lookup_cred(void) } EXPORT_SYMBOL_GPL(rpc_lookup_cred); +struct rpc_cred *rpc_lookup_cred_nonblock(void) +{ + return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU); +} +EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock); + /* * Public call interface for looking up machine creds. */ diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index b6e440baccc3..afb292cd797d 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -183,8 +183,9 @@ gss_cred_get_ctx(struct rpc_cred *cred) struct gss_cl_ctx *ctx = NULL; rcu_read_lock(); - if (gss_cred->gc_ctx) - ctx = gss_get_ctx(gss_cred->gc_ctx); + ctx = rcu_dereference(gss_cred->gc_ctx); + if (ctx) + gss_get_ctx(ctx); rcu_read_unlock(); return ctx; } @@ -262,9 +263,22 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct p = ERR_PTR(ret); goto err; } - dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u\n", - __func__, ctx->gc_expiry, now, timeout); - return q; + + /* is there any trailing data? */ + if (q == end) { + p = q; + goto done; + } + + /* pull in acceptor name (if there is one) */ + p = simple_get_netobj(q, end, &ctx->gc_acceptor); + if (IS_ERR(p)) + goto err; +done: + dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u acceptor %.*s\n", + __func__, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len, + ctx->gc_acceptor.data); + return p; err: dprintk("RPC: %s returns error %ld\n", __func__, -PTR_ERR(p)); return p; @@ -1194,13 +1208,13 @@ gss_destroying_context(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); + struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); struct rpc_task *task; - if (gss_cred->gc_ctx == NULL || - test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) + if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) return 0; - gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY; + ctx->gc_proc = RPC_GSS_PROC_DESTROY; cred->cr_ops = &gss_nullops; /* Take a reference to ensure the cred will be destroyed either @@ -1225,6 +1239,7 @@ gss_do_free_ctx(struct gss_cl_ctx *ctx) gss_delete_sec_context(&ctx->gc_gss_ctx); kfree(ctx->gc_wire_ctx.data); + kfree(ctx->gc_acceptor.data); kfree(ctx); } @@ -1260,7 +1275,7 @@ gss_destroy_nullcred(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); - struct gss_cl_ctx *ctx = gss_cred->gc_ctx; + struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); RCU_INIT_POINTER(gss_cred->gc_ctx, NULL); call_rcu(&cred->cr_rcu, gss_free_cred_callback); @@ -1332,6 +1347,36 @@ gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred) return err; } +static char * +gss_stringify_acceptor(struct rpc_cred *cred) +{ + char *string = NULL; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + struct gss_cl_ctx *ctx; + struct xdr_netobj *acceptor; + + rcu_read_lock(); + ctx = rcu_dereference(gss_cred->gc_ctx); + if (!ctx) + goto out; + + acceptor = &ctx->gc_acceptor; + + /* no point if there's no string */ + if (!acceptor->len) + goto out; + + string = kmalloc(acceptor->len + 1, GFP_KERNEL); + if (!string) + goto out; + + memcpy(string, acceptor->data, acceptor->len); + string[acceptor->len] = '\0'; +out: + rcu_read_unlock(); + return string; +} + /* * Returns -EACCES if GSS context is NULL or will expire within the * timeout (miliseconds) @@ -1340,15 +1385,16 @@ static int gss_key_timeout(struct rpc_cred *rc) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); + struct gss_cl_ctx *ctx; unsigned long now = jiffies; unsigned long expire; - if (gss_cred->gc_ctx == NULL) - return -EACCES; - - expire = gss_cred->gc_ctx->gc_expiry - (gss_key_expire_timeo * HZ); - - if (time_after(now, expire)) + rcu_read_lock(); + ctx = rcu_dereference(gss_cred->gc_ctx); + if (ctx) + expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ); + rcu_read_unlock(); + if (!ctx || time_after(now, expire)) return -EACCES; return 0; } @@ -1357,13 +1403,19 @@ static int gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); + struct gss_cl_ctx *ctx; int ret; if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags)) goto out; /* Don't match with creds that have expired. */ - if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry)) + rcu_read_lock(); + ctx = rcu_dereference(gss_cred->gc_ctx); + if (!ctx || time_after(jiffies, ctx->gc_expiry)) { + rcu_read_unlock(); return 0; + } + rcu_read_unlock(); if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags)) return 0; out: @@ -1909,29 +1961,31 @@ static const struct rpc_authops authgss_ops = { }; static const struct rpc_credops gss_credops = { - .cr_name = "AUTH_GSS", - .crdestroy = gss_destroy_cred, - .cr_init = gss_cred_init, - .crbind = rpcauth_generic_bind_cred, - .crmatch = gss_match, - .crmarshal = gss_marshal, - .crrefresh = gss_refresh, - .crvalidate = gss_validate, - .crwrap_req = gss_wrap_req, - .crunwrap_resp = gss_unwrap_resp, - .crkey_timeout = gss_key_timeout, + .cr_name = "AUTH_GSS", + .crdestroy = gss_destroy_cred, + .cr_init = gss_cred_init, + .crbind = rpcauth_generic_bind_cred, + .crmatch = gss_match, + .crmarshal = gss_marshal, + .crrefresh = gss_refresh, + .crvalidate = gss_validate, + .crwrap_req = gss_wrap_req, + .crunwrap_resp = gss_unwrap_resp, + .crkey_timeout = gss_key_timeout, + .crstringify_acceptor = gss_stringify_acceptor, }; static const struct rpc_credops gss_nullops = { - .cr_name = "AUTH_GSS", - .crdestroy = gss_destroy_nullcred, - .crbind = rpcauth_generic_bind_cred, - .crmatch = gss_match, - .crmarshal = gss_marshal, - .crrefresh = gss_refresh_null, - .crvalidate = gss_validate, - .crwrap_req = gss_wrap_req, - .crunwrap_resp = gss_unwrap_resp, + .cr_name = "AUTH_GSS", + .crdestroy = gss_destroy_nullcred, + .crbind = rpcauth_generic_bind_cred, + .crmatch = gss_match, + .crmarshal = gss_marshal, + .crrefresh = gss_refresh_null, + .crvalidate = gss_validate, + .crwrap_req = gss_wrap_req, + .crunwrap_resp = gss_unwrap_resp, + .crstringify_acceptor = gss_stringify_acceptor, }; static const struct rpc_pipe_ops gss_upcall_ops_v0 = { diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 0f43e894bc0a..f5ed9f6ece06 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -641,7 +641,7 @@ out: u32 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, - struct xdr_buf *buf, int ec, struct page **pages) + struct xdr_buf *buf, struct page **pages) { u32 err; struct xdr_netobj hmac; @@ -684,13 +684,8 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, ecptr = buf->tail[0].iov_base; } - memset(ecptr, 'X', ec); - buf->tail[0].iov_len += ec; - buf->len += ec; - /* copy plaintext gss token header after filler (if any) */ - memcpy(ecptr + ec, buf->head[0].iov_base + offset, - GSS_KRB5_TOK_HDR_LEN); + memcpy(ecptr, buf->head[0].iov_base + offset, GSS_KRB5_TOK_HDR_LEN); buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN; buf->len += GSS_KRB5_TOK_HDR_LEN; diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 62ae3273186c..42768e5c3994 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -70,31 +70,37 @@ DEFINE_SPINLOCK(krb5_seq_lock); -static char * +static void * setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) { - __be16 *ptr, *krb5_hdr; + u16 *ptr; + void *krb5_hdr; int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; token->len = g_token_size(&ctx->mech_used, body_size); - ptr = (__be16 *)token->data; + ptr = (u16 *)token->data; g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr); /* ptr now at start of header described in rfc 1964, section 1.2.1: */ krb5_hdr = ptr; *ptr++ = KG_TOK_MIC_MSG; - *ptr++ = cpu_to_le16(ctx->gk5e->signalg); + /* + * signalg is stored as if it were converted from LE to host endian, even + * though it's an opaque pair of bytes according to the RFC. + */ + *ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg); *ptr++ = SEAL_ALG_NONE; - *ptr++ = 0xffff; + *ptr = 0xffff; - return (char *)krb5_hdr; + return krb5_hdr; } static void * setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) { - __be16 *ptr, *krb5_hdr; + u16 *ptr; + void *krb5_hdr; u8 *p, flags = 0x00; if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0) @@ -104,15 +110,15 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) /* Per rfc 4121, sec 4.2.6.1, there is no header, * just start the token */ - krb5_hdr = ptr = (__be16 *)token->data; + krb5_hdr = ptr = (u16 *)token->data; *ptr++ = KG2_TOK_MIC; p = (u8 *)ptr; *p++ = flags; *p++ = 0xff; - ptr = (__be16 *)p; - *ptr++ = 0xffff; + ptr = (u16 *)p; *ptr++ = 0xffff; + *ptr = 0xffff; token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; return krb5_hdr; @@ -181,7 +187,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, spin_lock(&krb5_seq_lock); seq_send = ctx->seq_send64++; spin_unlock(&krb5_seq_lock); - *((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send); + *((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send); if (ctx->initiate) { cksumkey = ctx->initiator_sign; diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 42560e55d978..4b614c604fe0 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -201,9 +201,15 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength; - *(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); - memset(ptr + 4, 0xff, 4); - *(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); + /* + * signalg and sealalg are stored as if they were converted from LE + * to host endian, even though they're opaque pairs of bytes according + * to the RFC. + */ + *(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); + *(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); + ptr[6] = 0xff; + ptr[7] = 0xff; gss_krb5_make_confounder(msg_start, conflen); @@ -438,7 +444,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, u8 *ptr, *plainhdr; s32 now; u8 flags = 0x00; - __be16 *be16ptr, ec = 0; + __be16 *be16ptr; __be64 *be64ptr; u32 err; @@ -468,16 +474,16 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, be16ptr = (__be16 *)ptr; blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc); - *be16ptr++ = cpu_to_be16(ec); + *be16ptr++ = 0; /* "inner" token header always uses 0 for RRC */ - *be16ptr++ = cpu_to_be16(0); + *be16ptr++ = 0; be64ptr = (__be64 *)be16ptr; spin_lock(&krb5_seq_lock); *be64ptr = cpu_to_be64(kctx->seq_send64++); spin_unlock(&krb5_seq_lock); - err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages); + err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages); if (err) return err; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 4ce5eccec1f6..c548ab213f76 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -886,7 +886,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs u32 priv_len, maj_stat; int pad, saved_len, remaining_len, offset; - rqstp->rq_splice_ok = 0; + rqstp->rq_splice_ok = false; priv_len = svc_getnl(&buf->head[0]); if (rqstp->rq_deferred) { diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index f0ebe07978a2..712c123e04e9 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -35,6 +35,8 @@ nul_destroy(struct rpc_auth *auth) static struct rpc_cred * nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { + if (flags & RPCAUTH_LOOKUP_RCU) + return &null_cred; return get_rpccred(&null_cred); } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2e6ab10734f6..488ddeed9363 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1746,6 +1746,7 @@ call_bind_status(struct rpc_task *task) case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + case -ENOBUFS: case -EPIPE: dprintk("RPC: %5u remote rpcbind unreachable: %d\n", task->tk_pid, task->tk_status); @@ -1812,6 +1813,8 @@ call_connect_status(struct rpc_task *task) case -ECONNABORTED: case -ENETUNREACH: case -EHOSTUNREACH: + case -ENOBUFS: + case -EPIPE: if (RPC_IS_SOFTCONN(task)) break; /* retry with existing socket, after a delay */ @@ -1918,6 +1921,7 @@ call_transmit_status(struct rpc_task *task) case -ECONNRESET: case -ECONNABORTED: case -ENOTCONN: + case -ENOBUFS: case -EPIPE: rpc_task_force_reencode(task); } @@ -2034,6 +2038,7 @@ call_status(struct rpc_task *task) case -ECONNRESET: case -ECONNABORTED: rpc_force_rebind(clnt); + case -ENOBUFS: rpc_delay(task, 3*HZ); case -EPIPE: case -ENOTCONN: diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index b18554898562..2d12b76b5a64 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -195,7 +195,7 @@ static struct inode * rpc_alloc_inode(struct super_block *sb) { struct rpc_inode *rpci; - rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL); + rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL); if (!rpci) return NULL; return &rpci->vfs_inode; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5de6801cd924..1db5007ddbce 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1086,9 +1086,9 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto err_short_len; /* Will be turned off only in gss privacy case: */ - rqstp->rq_splice_ok = 1; + rqstp->rq_splice_ok = true; /* Will be turned off only when NFSv4 Sessions are used */ - rqstp->rq_usedeferral = 1; + rqstp->rq_usedeferral = true; rqstp->rq_dropme = false; /* Setup reply header */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index b4737fbdec13..6666c6745858 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -23,6 +23,7 @@ static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); static void svc_age_temp_xprts(unsigned long closure); static void svc_delete_xprt(struct svc_xprt *xprt); +static void svc_xprt_do_enqueue(struct svc_xprt *xprt); /* apparently the "standard" is that clients close * idle connections after 5 minutes, servers after @@ -222,11 +223,12 @@ static void svc_xprt_received(struct svc_xprt *xprt) if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) return; /* As soon as we clear busy, the xprt could be closed and - * 'put', so we need a reference to call svc_xprt_enqueue with: + * 'put', so we need a reference to call svc_xprt_do_enqueue with: */ svc_xprt_get(xprt); + smp_mb__before_atomic(); clear_bit(XPT_BUSY, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); + svc_xprt_do_enqueue(xprt); svc_xprt_put(xprt); } @@ -335,12 +337,7 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) return false; } -/* - * Queue up a transport with data pending. If there are idle nfsd - * processes, wake 'em up. - * - */ -void svc_xprt_enqueue(struct svc_xprt *xprt) +static void svc_xprt_do_enqueue(struct svc_xprt *xprt) { struct svc_pool *pool; struct svc_rqst *rqstp; @@ -398,6 +395,18 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) out_unlock: spin_unlock_bh(&pool->sp_lock); } + +/* + * Queue up a transport with data pending. If there are idle nfsd + * processes, wake 'em up. + * + */ +void svc_xprt_enqueue(struct svc_xprt *xprt) +{ + if (test_bit(XPT_BUSY, &xprt->xpt_flags)) + return; + svc_xprt_do_enqueue(xprt); +} EXPORT_SYMBOL_GPL(svc_xprt_enqueue); /* @@ -439,6 +448,8 @@ void svc_reserve(struct svc_rqst *rqstp, int space) atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); rqstp->rq_reserved = space; + if (xprt->xpt_ops->xpo_adjust_wspace) + xprt->xpt_ops->xpo_adjust_wspace(xprt); svc_xprt_enqueue(xprt); } } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index b507cd327d9b..c24a8ff33f8f 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -446,15 +446,43 @@ static void svc_write_space(struct sock *sk) } } +static int svc_tcp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + int required; + + if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) + return 1; + required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg; + if (sk_stream_wspace(svsk->sk_sk) >= required || + (sk_stream_min_wspace(svsk->sk_sk) == 0 && + atomic_read(&xprt->xpt_reserved) == 0)) + return 1; + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 0; +} + static void svc_tcp_write_space(struct sock *sk) { + struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); struct socket *sock = sk->sk_socket; - if (sk_stream_is_writeable(sk) && sock) + if (!sk_stream_is_writeable(sk) || !sock) + return; + if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt)) clear_bit(SOCK_NOSPACE, &sock->flags); svc_write_space(sk); } +static void svc_tcp_adjust_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + + if (svc_tcp_has_wspace(xprt)) + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); +} + /* * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo */ @@ -692,6 +720,7 @@ static struct svc_xprt_class svc_udp_class = { .xcl_owner = THIS_MODULE, .xcl_ops = &svc_udp_ops, .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, + .xcl_ident = XPRT_TRANSPORT_UDP, }; static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) @@ -1197,23 +1226,6 @@ static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) svc_putnl(resv, 0); } -static int svc_tcp_has_wspace(struct svc_xprt *xprt) -{ - struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - struct svc_serv *serv = svsk->sk_xprt.xpt_server; - int required; - - if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) - return 1; - required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg; - if (sk_stream_wspace(svsk->sk_sk) >= required || - (sk_stream_min_wspace(svsk->sk_sk) == 0 && - atomic_read(&xprt->xpt_reserved) == 0)) - return 1; - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - return 0; -} - static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -1285,6 +1297,7 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, .xpo_secure_port = svc_sock_secure_port, + .xpo_adjust_wspace = svc_tcp_adjust_wspace, }; static struct svc_xprt_class svc_tcp_class = { @@ -1292,6 +1305,7 @@ static struct svc_xprt_class svc_tcp_class = { .xcl_owner = THIS_MODULE, .xcl_ops = &svc_tcp_ops, .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, + .xcl_ident = XPRT_TRANSPORT_TCP, }; void svc_init_xprt_sock(void) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 23fb4e75e245..290af97bf6f9 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -509,7 +509,8 @@ void xdr_commit_encode(struct xdr_stream *xdr) } EXPORT_SYMBOL_GPL(xdr_commit_encode); -__be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes) +static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, + size_t nbytes) { static __be32 *p; int space_left; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index c3b2b3369e52..56e4e150e80e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -744,6 +744,7 @@ static void xprt_connect_status(struct rpc_task *task) case -ECONNABORTED: case -ENETUNREACH: case -EHOSTUNREACH: + case -EPIPE: case -EAGAIN: dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); break; @@ -1306,7 +1307,7 @@ struct rpc_xprt *xprt_create_transport(struct xprt_create *args) } } spin_unlock(&xprt_list_lock); - printk(KERN_ERR "RPC: transport (%d) not supported\n", args->ident); + dprintk("RPC: transport (%d) not supported\n", args->ident); return ERR_PTR(-EIO); found: diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 693966d3f33b..6166c985fe24 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,14 +53,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - #ifdef RPC_DEBUG static const char transfertypes[][12] = { "pure inline", /* no chunks */ @@ -279,13 +271,37 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return (unsigned char *)iptr - (unsigned char *)headerp; out: - for (pos = 0; nchunks--;) - pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt); + if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { + for (pos = 0; nchunks--;) + pos += rpcrdma_deregister_external( + &req->rl_segments[pos], r_xprt); + } return n; } /* + * Marshal chunks. This routine returns the header length + * consumed by marshaling. + * + * Returns positive RPC/RDMA header size, or negative errno. + */ + +ssize_t +rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) +{ + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base; + + if (req->rl_rtype != rpcrdma_noch) + result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, + headerp, req->rl_rtype); + else if (req->rl_wtype != rpcrdma_noch) + result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, + headerp, req->rl_wtype); + return result; +} + +/* * Copy write data inline. * This function is used for "small" requests. Data which is passed * to RPC via iovecs (or page list) is copied directly into the @@ -377,7 +393,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) char *base; size_t rpclen, padlen; ssize_t hdrlen; - enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; /* @@ -415,13 +430,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * into pages; otherwise use reply chunks. */ if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - wtype = rpcrdma_noch; + req->rl_wtype = rpcrdma_noch; else if (rqst->rq_rcv_buf.page_len == 0) - wtype = rpcrdma_replych; + req->rl_wtype = rpcrdma_replych; else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - wtype = rpcrdma_writech; + req->rl_wtype = rpcrdma_writech; else - wtype = rpcrdma_replych; + req->rl_wtype = rpcrdma_replych; /* * Chunks needed for arguments? @@ -438,16 +453,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * TBD check NFSv4 setacl */ if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - rtype = rpcrdma_noch; + req->rl_rtype = rpcrdma_noch; else if (rqst->rq_snd_buf.page_len == 0) - rtype = rpcrdma_areadch; + req->rl_rtype = rpcrdma_areadch; else - rtype = rpcrdma_readch; + req->rl_rtype = rpcrdma_readch; /* The following simplification is not true forever */ - if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) - wtype = rpcrdma_noch; - if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { + if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) + req->rl_wtype = rpcrdma_noch; + if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { dprintk("RPC: %s: cannot marshal multiple chunk lists\n", __func__); return -EIO; @@ -461,7 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * When padding is in use and applies to the transfer, insert * it and change the message type. */ - if (rtype == rpcrdma_noch) { + if (req->rl_rtype == rpcrdma_noch) { padlen = rpcrdma_inline_pullup(rqst, RPCRDMA_INLINE_PAD_VALUE(rqst)); @@ -476,7 +491,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (wtype != rpcrdma_noch) { + if (req->rl_wtype != rpcrdma_noch) { dprintk("RPC: %s: invalid chunk list\n", __func__); return -EIO; @@ -497,30 +512,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. */ - if (wtype == rpcrdma_noch) - wtype = rpcrdma_replych; + if (req->rl_wtype == rpcrdma_noch) + req->rl_wtype = rpcrdma_replych; } } - /* - * Marshal chunks. This routine will return the header length - * consumed by marshaling. - */ - if (rtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, - &rqst->rq_snd_buf, headerp, rtype); - wtype = rtype; /* simplify dprintk */ - - } else if (wtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, - &rqst->rq_rcv_buf, headerp, wtype); - } + hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); if (hdrlen < 0) return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, headerp, base, req->rl_iov.lkey); /* diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 8f92a61ee2df..e0110270d650 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -43,6 +43,7 @@ #include <linux/sunrpc/debug.h> #include <linux/sunrpc/rpc_rdma.h> #include <linux/spinlock.h> +#include <linux/highmem.h> #include <asm/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> @@ -435,6 +436,32 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, return ret; } +/* + * To avoid a separate RDMA READ just for a handful of zero bytes, + * RFC 5666 section 3.7 allows the client to omit the XDR zero pad + * in chunk lists. + */ +static void +rdma_fix_xdr_pad(struct xdr_buf *buf) +{ + unsigned int page_len = buf->page_len; + unsigned int size = (XDR_QUADLEN(page_len) << 2) - page_len; + unsigned int offset, pg_no; + char *p; + + if (size == 0) + return; + + pg_no = page_len >> PAGE_SHIFT; + offset = page_len & ~PAGE_MASK; + p = page_address(buf->pages[pg_no]); + memset(p + offset, 0, size); + + buf->page_len += size; + buf->buflen += size; + buf->len += size; +} + static int rdma_read_complete(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head) { @@ -449,6 +476,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_pages[page_no] = head->pages[page_no]; } /* Point rq_arg.pages past header */ + rdma_fix_xdr_pad(&head->arg); rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; rqstp->rq_arg.page_len = head->arg.page_len; rqstp->rq_arg.page_base = head->arg.page_base; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 49fd21a5c215..9f1b50689c0f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -192,6 +192,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, xdr_sge_no++; BUG_ON(xdr_sge_no > vec->count); bc -= sge_bytes; + if (sge_no == xprt->sc_max_sge) + break; } /* Prepare WRITE WR */ @@ -209,7 +211,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, atomic_inc(&rdma_stat_write); if (svc_rdma_send(xprt, &write_wr)) goto err; - return 0; + return write_len - bc; err: svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 0); @@ -225,7 +227,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, { u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; int write_len; - int max_write; u32 xdr_off; int chunk_off; int chunk_no; @@ -239,8 +240,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[1]; - max_write = xprt->sc_max_sge * PAGE_SIZE; - /* Write chunks start at the pagelist */ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; xfer_len && chunk_no < arg_ary->wc_nchunks; @@ -260,23 +259,21 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, write_len); chunk_off = 0; while (write_len) { - int this_write; - this_write = min(write_len, max_write); ret = send_write(xprt, rqstp, ntohl(arg_ch->rs_handle), rs_offset + chunk_off, xdr_off, - this_write, + write_len, vec); - if (ret) { + if (ret <= 0) { dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", ret); return -EIO; } - chunk_off += this_write; - xdr_off += this_write; - xfer_len -= this_write; - write_len -= this_write; + chunk_off += ret; + xdr_off += ret; + xfer_len -= ret; + write_len -= ret; } } /* Update the req with the number of chunks actually used */ @@ -293,7 +290,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, { u32 xfer_len = rqstp->rq_res.len; int write_len; - int max_write; u32 xdr_off; int chunk_no; int chunk_off; @@ -311,8 +307,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[2]; - max_write = xprt->sc_max_sge * PAGE_SIZE; - /* xdr offset starts at RPC message */ nchunks = ntohl(arg_ary->wc_nchunks); for (xdr_off = 0, chunk_no = 0; @@ -330,24 +324,21 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, write_len); chunk_off = 0; while (write_len) { - int this_write; - - this_write = min(write_len, max_write); ret = send_write(xprt, rqstp, ntohl(ch->rs_handle), rs_offset + chunk_off, xdr_off, - this_write, + write_len, vec); - if (ret) { + if (ret <= 0) { dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", ret); return -EIO; } - chunk_off += this_write; - xdr_off += this_write; - xfer_len -= this_write; - write_len -= this_write; + chunk_off += ret; + xdr_off += ret; + xfer_len -= ret; + write_len -= ret; } } /* Update the req with the number of chunks actually used */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e7323fbbd348..374feb44afea 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -92,6 +92,7 @@ struct svc_xprt_class svc_rdma_class = { .xcl_owner = THIS_MODULE, .xcl_ops = &svc_rdma_ops, .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, + .xcl_ident = XPRT_TRANSPORT_RDMA, }; struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) @@ -942,23 +943,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); if (ret) { - /* - * XXX: This is a hack. We need a xx_request_qp interface - * that will adjust the qp_attr's with a best-effort - * number - */ - qp_attr.cap.max_send_sge -= 2; - qp_attr.cap.max_recv_sge -= 2; - ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, - &qp_attr); - if (ret) { - dprintk("svcrdma: failed to create QP, ret=%d\n", ret); - goto errout; - } - newxprt->sc_max_sge = qp_attr.cap.max_send_sge; - newxprt->sc_max_sge = qp_attr.cap.max_recv_sge; - newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; - newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; + dprintk("svcrdma: failed to create QP, ret=%d\n", ret); + goto errout; } newxprt->sc_qp = newxprt->sc_cm_id->qp; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 66f91f0d071a..2faac4940563 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -296,7 +296,6 @@ xprt_setup_rdma(struct xprt_create *args) xprt->resvport = 0; /* privileged port not needed */ xprt->tsh_size = 0; /* RPC-RDMA handles framing */ - xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE; xprt->ops = &xprt_rdma_procs; /* @@ -382,6 +381,9 @@ xprt_setup_rdma(struct xprt_create *args) new_ep->rep_xprt = xprt; xprt_rdma_format_addresses(xprt); + xprt->max_payload = rpcrdma_max_payload(new_xprt); + dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", + __func__, xprt->max_payload); if (!try_module_get(THIS_MODULE)) goto out4; @@ -412,7 +414,7 @@ xprt_rdma_close(struct rpc_xprt *xprt) if (r_xprt->rx_ep.rep_connected > 0) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); - (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); } static void @@ -595,13 +597,14 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int rc; + int rc = 0; - if (req->rl_niovs == 0) { + if (req->rl_niovs == 0) rc = rpcrdma_marshal_req(rqst); - if (rc < 0) - goto failed_marshal; - } + else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + rc = rpcrdma_marshal_chunks(rqst, 0); + if (rc < 0) + goto failed_marshal; if (req->rl_reply == NULL) /* e.g. reconnection */ rpcrdma_recv_buffer_get(req); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 13dbd1c389ff..61c41298b4ea 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -61,6 +61,8 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +static void rpcrdma_reset_frmrs(struct rpcrdma_ia *); + /* * internal functions */ @@ -103,17 +105,6 @@ rpcrdma_run_tasklet(unsigned long data) static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); -static inline void -rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep) -{ - unsigned long flags; - - spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); - list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g); - spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); - tasklet_schedule(&rpcrdma_tasklet_g); -} - static void rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) { @@ -153,12 +144,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) if (wc->wr_id == 0ULL) return; if (wc->status != IB_WC_SUCCESS) - return; - - if (wc->opcode == IB_WC_FAST_REG_MR) - frmr->r.frmr.state = FRMR_IS_VALID; - else if (wc->opcode == IB_WC_LOCAL_INV) - frmr->r.frmr.state = FRMR_IS_INVALID; + frmr->r.frmr.fr_state = FRMR_IS_STALE; } static int @@ -217,7 +203,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) } static void -rpcrdma_recvcq_process_wc(struct ib_wc *wc) +rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) { struct rpcrdma_rep *rep = (struct rpcrdma_rep *)(unsigned long)wc->wr_id; @@ -248,28 +234,38 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc) } out_schedule: - rpcrdma_schedule_tasklet(rep); + list_add_tail(&rep->rr_list, sched_list); } static int rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) { + struct list_head sched_list; struct ib_wc *wcs; int budget, count, rc; + unsigned long flags; + INIT_LIST_HEAD(&sched_list); budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; do { wcs = ep->rep_recv_wcs; rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); if (rc <= 0) - return rc; + goto out_schedule; count = rc; while (count-- > 0) - rpcrdma_recvcq_process_wc(wcs++); + rpcrdma_recvcq_process_wc(wcs++, &sched_list); } while (rc == RPCRDMA_POLLSIZE && --budget); - return 0; + rc = 0; + +out_schedule: + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + list_splice_tail(&sched_list, &rpcrdma_tasklets_g); + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + tasklet_schedule(&rpcrdma_tasklet_g); + return rc; } /* @@ -310,6 +306,13 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) rpcrdma_recvcq_poll(cq, ep); } +static void +rpcrdma_flush_cqs(struct rpcrdma_ep *ep) +{ + rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep); + rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep); +} + #ifdef RPC_DEBUG static const char * const conn[] = { "address resolved", @@ -323,8 +326,16 @@ static const char * const conn[] = { "rejected", "established", "disconnected", - "device removal" + "device removal", + "multicast join", + "multicast error", + "address change", + "timewait exit", }; + +#define CONNECTION_MSG(status) \ + ((status) < ARRAY_SIZE(conn) ? \ + conn[(status)] : "unrecognized connection error") #endif static int @@ -382,23 +393,18 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DEVICE_REMOVAL: connstate = -ENODEV; connected: - dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n", - __func__, - (event->event <= 11) ? conn[event->event] : - "unknown connection error", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), - ep, event->event); atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); dprintk("RPC: %s: %sconnected\n", __func__, connstate > 0 ? "" : "dis"); ep->rep_connected = connstate; ep->rep_func(ep); wake_up_all(&ep->rep_connect_wait); - break; + /*FALLTHROUGH*/ default: - dprintk("RPC: %s: unexpected CM event %d\n", - __func__, event->event); + dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", + __func__, &addr->sin_addr.s_addr, + ntohs(addr->sin_port), ep, + CONNECTION_MSG(event->event)); break; } @@ -558,12 +564,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (!ia->ri_id->device->alloc_fmr) { dprintk("RPC: %s: MTHCAFMR registration " "not supported by HCA\n", __func__); -#if RPCRDMA_PERSISTENT_REGISTRATION memreg = RPCRDMA_ALLPHYSICAL; -#else - rc = -ENOMEM; - goto out2; -#endif } } @@ -578,20 +579,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) switch (memreg) { case RPCRDMA_FRMR: break; -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: mem_priv = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; goto register_setup; -#endif case RPCRDMA_MTHCAFMR: if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; -#if RPCRDMA_PERSISTENT_REGISTRATION register_setup: -#endif ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " @@ -613,6 +610,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; + rwlock_init(&ia->ri_qplock); return 0; out2: rdma_destroy_id(ia->ri_id); @@ -826,10 +824,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) cancel_delayed_work_sync(&ep->rep_connect_worker); if (ia->ri_id->qp) { - rc = rpcrdma_ep_disconnect(ep, ia); - if (rc) - dprintk("RPC: %s: rpcrdma_ep_disconnect" - " returned %i\n", __func__, rc); + rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } @@ -859,7 +854,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) int rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - struct rdma_cm_id *id; + struct rdma_cm_id *id, *old; int rc = 0; int retry_count = 0; @@ -867,13 +862,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rpcrdma_xprt *xprt; retry: dprintk("RPC: %s: reconnecting...\n", __func__); - rc = rpcrdma_ep_disconnect(ep, ia); - if (rc && rc != -ENOTCONN) - dprintk("RPC: %s: rpcrdma_ep_disconnect" - " status %i\n", __func__, rc); - rpcrdma_clean_cq(ep->rep_attr.recv_cq); - rpcrdma_clean_cq(ep->rep_attr.send_cq); + rpcrdma_ep_disconnect(ep, ia); + rpcrdma_flush_cqs(ep); + + if (ia->ri_memreg_strategy == RPCRDMA_FRMR) + rpcrdma_reset_frmrs(ia); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, @@ -905,9 +899,14 @@ retry: rc = -ENETUNREACH; goto out; } - rdma_destroy_qp(ia->ri_id); - rdma_destroy_id(ia->ri_id); + + write_lock(&ia->ri_qplock); + old = ia->ri_id; ia->ri_id = id; + write_unlock(&ia->ri_qplock); + + rdma_destroy_qp(old); + rdma_destroy_id(old); } else { dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); @@ -974,13 +973,12 @@ out: * This call is not reentrant, and must not be made in parallel * on the same endpoint. */ -int +void rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; - rpcrdma_clean_cq(ep->rep_attr.recv_cq); - rpcrdma_clean_cq(ep->rep_attr.send_cq); + rpcrdma_flush_cqs(ep); rc = rdma_disconnect(ia->ri_id); if (!rc) { /* returns without wait if not connected */ @@ -992,12 +990,93 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); ep->rep_connected = rc; } +} + +static int +rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) +{ + int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; + struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_DATA_SEGS, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + struct rpcrdma_mw *r; + int i, rc; + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + + r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr)) { + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr failed %i\n", + __func__, rc); + goto out_free; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + return 0; + +out_free: + kfree(r); + return rc; +} + +static int +rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_frmr *f; + struct rpcrdma_mw *r; + int i, rc; + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + f = &r->r.frmr; + + f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_mr)) { + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr " + "failed %i\n", __func__, rc); + goto out_free; + } + + f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device, + ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_pgl)) { + rc = PTR_ERR(f->fr_pgl); + dprintk("RPC: %s: ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + + ib_dereg_mr(f->fr_mr); + goto out_free; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + + return 0; + +out_free: + kfree(r); return rc; } -/* - * Initialize buffer memory - */ int rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) @@ -1005,7 +1084,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, char *p; size_t len, rlen, wlen; int i, rc; - struct rpcrdma_mw *r; buf->rb_max_requests = cdata->max_requests; spin_lock_init(&buf->rb_lock); @@ -1016,28 +1094,12 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies * 4. padding, if any - * 5. mw's, fmr's or frmr's, if any * Send/recv buffers in req/rep need to be registered */ - len = buf->rb_max_requests * (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); len += cdata->padding; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - len += buf->rb_max_requests * RPCRDMA_MAX_SEGS * - sizeof(struct rpcrdma_mw); - break; - case RPCRDMA_MTHCAFMR: - /* TBD we are perhaps overallocating here */ - len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * - sizeof(struct rpcrdma_mw); - break; - default: - break; - } - /* allocate 1, 4 and 5 in one shot */ p = kzalloc(len, GFP_KERNEL); if (p == NULL) { dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", @@ -1064,51 +1126,17 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, p += cdata->padding; INIT_LIST_HEAD(&buf->rb_mws); - r = (struct rpcrdma_mw *)p; + INIT_LIST_HEAD(&buf->rb_all); switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: - for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { - r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_mr)) { - rc = PTR_ERR(r->r.frmr.fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr" - " failed %i\n", __func__, rc); - goto out; - } - r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( - ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_pgl)) { - rc = PTR_ERR(r->r.frmr.fr_pgl); - dprintk("RPC: %s: " - "ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(r->r.frmr.fr_mr); - goto out; - } - list_add(&r->mw_list, &buf->rb_mws); - ++r; - } + rc = rpcrdma_init_frmrs(ia, buf); + if (rc) + goto out; break; case RPCRDMA_MTHCAFMR: - /* TBD we are perhaps overallocating here */ - for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { - static struct ib_fmr_attr fa = - { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; - r->r.fmr = ib_alloc_fmr(ia->ri_pd, - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, - &fa); - if (IS_ERR(r->r.fmr)) { - rc = PTR_ERR(r->r.fmr); - dprintk("RPC: %s: ib_alloc_fmr" - " failed %i\n", __func__, rc); - goto out; - } - list_add(&r->mw_list, &buf->rb_mws); - ++r; - } + rc = rpcrdma_init_fmrs(ia, buf); + if (rc) + goto out; break; default: break; @@ -1176,24 +1204,57 @@ out: return rc; } -/* - * Unregister and destroy buffer memory. Need to deal with - * partial initialization, so it's callable from failed create. - * Must be called before destroying endpoint, as registrations - * reference it. - */ +static void +rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + list_del(&r->mw_list); + + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", + __func__, rc); + + kfree(r); + } +} + +static void +rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + list_del(&r->mw_list); + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + + kfree(r); + } +} + void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - int rc, i; struct rpcrdma_ia *ia = rdmab_to_ia(buf); - struct rpcrdma_mw *r; + int i; /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) * 2. send mr memory (mr free, then kfree) - * 3. padding (if any) [moved to rpcrdma_ep_destroy] - * 4. arrays + * 3. MWs */ dprintk("RPC: %s: entering\n", __func__); @@ -1212,34 +1273,217 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } } + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rpcrdma_destroy_frmrs(buf); + break; + case RPCRDMA_MTHCAFMR: + rpcrdma_destroy_fmrs(buf); + break; + default: + break; + } + + kfree(buf->rb_pool); +} + +/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in + * an unusable state. Find FRMRs in this state and dereg / reg + * each. FRMRs that are VALID and attached to an rpcrdma_req are + * also torn down. + * + * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_frmr_external(). + */ +static void +rpcrdma_reset_frmrs(struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = + container_of(ia, struct rpcrdma_xprt, rx_ia); + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct list_head *pos; + struct rpcrdma_mw *r; + int rc; + + list_for_each(pos, &buf->rb_all) { + r = list_entry(pos, struct rpcrdma_mw, mw_all); + + if (r->r.frmr.fr_state == FRMR_IS_INVALID) + continue; + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + continue; + } + r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( + ia->ri_id->device, + ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + + ib_dereg_mr(r->r.frmr.fr_mr); + continue; + } + r->r.frmr.fr_state = FRMR_IS_INVALID; + } +} + +/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving + * some req segments uninitialized. + */ +static void +rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf) +{ + if (*mw) { + list_add_tail(&(*mw)->mw_list, &buf->rb_mws); + *mw = NULL; + } +} + +/* Cycle mw's back in reverse order, and "spin" them. + * This delays and scrambles reuse as much as possible. + */ +static void +rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mr_seg *seg = req->rl_segments; + struct rpcrdma_mr_seg *seg1 = seg; + int i; + + for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++) + rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf); + rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf); +} + +static void +rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + buf->rb_send_bufs[--buf->rb_send_index] = req; + req->rl_niovs = 0; + if (req->rl_reply) { + buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply; + req->rl_reply->rr_func = NULL; + req->rl_reply = NULL; + } +} + +/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). + * Redo only the ib_post_send(). + */ +static void +rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = + container_of(ia, struct rpcrdma_xprt, rx_ia); + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + dprintk("RPC: %s: FRMR %p is stale\n", __func__, r); + + /* When this FRMR is re-inserted into rb_mws, it is no longer stale */ + r->r.frmr.fr_state = FRMR_IS_INVALID; + + memset(&invalidate_wr, 0, sizeof(invalidate_wr)); + invalidate_wr.wr_id = (unsigned long)(void *)r; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + dprintk("RPC: %s: frmr %p invalidating rkey %08x\n", + __func__, r, r->r.frmr.fr_mr->rkey); + + read_lock(&ia->ri_qplock); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) { + /* Force rpcrdma_buffer_get() to retry */ + r->r.frmr.fr_state = FRMR_IS_STALE; + dprintk("RPC: %s: ib_post_send failed, %i\n", + __func__, rc); + } +} + +static void +rpcrdma_retry_flushed_linv(struct list_head *stale, + struct rpcrdma_buffer *buf) +{ + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct list_head *pos; + struct rpcrdma_mw *r; + unsigned long flags; + + list_for_each(pos, stale) { + r = list_entry(pos, struct rpcrdma_mw, mw_list); + rpcrdma_retry_local_inv(r, ia); + } + + spin_lock_irqsave(&buf->rb_lock, flags); + list_splice_tail(stale, &buf->rb_mws); + spin_unlock_irqrestore(&buf->rb_lock, flags); +} + +static struct rpcrdma_req * +rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf, + struct list_head *stale) +{ + struct rpcrdma_mw *r; + int i; + + i = RPCRDMA_MAX_SEGS - 1; while (!list_empty(&buf->rb_mws)) { r = list_entry(buf->rb_mws.next, - struct rpcrdma_mw, mw_list); + struct rpcrdma_mw, mw_list); list_del(&r->mw_list); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s:" - " ib_dereg_mr" - " failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - break; - case RPCRDMA_MTHCAFMR: - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s:" - " ib_dealloc_fmr" - " failed %i\n", - __func__, rc); - break; - default: - break; + if (r->r.frmr.fr_state == FRMR_IS_STALE) { + list_add(&r->mw_list, stale); + continue; } + req->rl_segments[i].mr_chunk.rl_mw = r; + if (unlikely(i-- == 0)) + return req; /* Success */ } - kfree(buf->rb_pool); + /* Not enough entries on rb_mws for this req */ + rpcrdma_buffer_put_sendbuf(req, buf); + rpcrdma_buffer_put_mrs(req, buf); + return NULL; +} + +static struct rpcrdma_req * +rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int i; + + i = RPCRDMA_MAX_SEGS - 1; + while (!list_empty(&buf->rb_mws)) { + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + req->rl_segments[i].mr_chunk.rl_mw = r; + if (unlikely(i-- == 0)) + return req; /* Success */ + } + + /* Not enough entries on rb_mws for this req */ + rpcrdma_buffer_put_sendbuf(req, buf); + rpcrdma_buffer_put_mrs(req, buf); + return NULL; } /* @@ -1254,10 +1498,10 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { + struct rpcrdma_ia *ia = rdmab_to_ia(buffers); + struct list_head stale; struct rpcrdma_req *req; unsigned long flags; - int i; - struct rpcrdma_mw *r; spin_lock_irqsave(&buffers->rb_lock, flags); if (buffers->rb_send_index == buffers->rb_max_requests) { @@ -1277,16 +1521,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; } buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; - if (!list_empty(&buffers->rb_mws)) { - i = RPCRDMA_MAX_SEGS - 1; - do { - r = list_entry(buffers->rb_mws.next, - struct rpcrdma_mw, mw_list); - list_del(&r->mw_list); - req->rl_segments[i].mr_chunk.rl_mw = r; - } while (--i >= 0); + + INIT_LIST_HEAD(&stale); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + req = rpcrdma_buffer_get_frmrs(req, buffers, &stale); + break; + case RPCRDMA_MTHCAFMR: + req = rpcrdma_buffer_get_fmrs(req, buffers); + break; + default: + break; } spin_unlock_irqrestore(&buffers->rb_lock, flags); + if (!list_empty(&stale)) + rpcrdma_retry_flushed_linv(&stale, buffers); return req; } @@ -1299,34 +1548,14 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) { struct rpcrdma_buffer *buffers = req->rl_buffer; struct rpcrdma_ia *ia = rdmab_to_ia(buffers); - int i; unsigned long flags; spin_lock_irqsave(&buffers->rb_lock, flags); - buffers->rb_send_bufs[--buffers->rb_send_index] = req; - req->rl_niovs = 0; - if (req->rl_reply) { - buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; - req->rl_reply->rr_func = NULL; - req->rl_reply = NULL; - } + rpcrdma_buffer_put_sendbuf(req, buffers); switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: - /* - * Cycle mw's back in reverse order, and "spin" them. - * This delays and scrambles reuse as much as possible. - */ - i = 1; - do { - struct rpcrdma_mw **mw; - mw = &req->rl_segments[i].mr_chunk.rl_mw; - list_add_tail(&(*mw)->mw_list, &buffers->rb_mws); - *mw = NULL; - } while (++i < RPCRDMA_MAX_SEGS); - list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list, - &buffers->rb_mws); - req->rl_segments[0].mr_chunk.rl_mw = NULL; + rpcrdma_buffer_put_mrs(req, buffers); break; default: break; @@ -1388,6 +1617,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, */ iov->addr = ib_dma_map_single(ia->ri_id->device, va, len, DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(ia->ri_id->device, iov->addr)) + return -ENOMEM; + iov->length = len; if (ia->ri_have_dma_lkey) { @@ -1483,8 +1715,10 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_mr_seg *seg1 = seg; - struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr; - + struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; + struct ib_mr *mr = frmr->fr_mr; + struct ib_send_wr fastreg_wr, *bad_wr; u8 key; int len, pageoff; int i, rc; @@ -1502,8 +1736,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, rpcrdma_map_one(ia, seg, writing); pa = seg->mr_dma; for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - seg1->mr_chunk.rl_mw->r.frmr.fr_pgl-> - page_list[page_no++] = pa; + frmr->fr_pgl->page_list[page_no++] = pa; pa += PAGE_SIZE; } len += seg->mr_len; @@ -1515,65 +1748,51 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, break; } dprintk("RPC: %s: Using frmr %p to map %d segments\n", - __func__, seg1->mr_chunk.rl_mw, i); - - if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) { - dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n", - __func__, - seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey); - /* Invalidate before using. */ - memset(&invalidate_wr, 0, sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; - invalidate_wr.next = &frmr_wr; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.send_flags = IB_SEND_SIGNALED; - invalidate_wr.ex.invalidate_rkey = - seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - post_wr = &invalidate_wr; - } else - post_wr = &frmr_wr; - - /* Prepare FRMR WR */ - memset(&frmr_wr, 0, sizeof frmr_wr); - frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; - frmr_wr.opcode = IB_WR_FAST_REG_MR; - frmr_wr.send_flags = IB_SEND_SIGNALED; - frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; - frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; - frmr_wr.wr.fast_reg.page_list_len = page_no; - frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - if (frmr_wr.wr.fast_reg.length < len) { - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - return -EIO; + __func__, mw, i); + + frmr->fr_state = FRMR_IS_VALID; + + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = (unsigned long)(void *)mw; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma; + fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; + fastreg_wr.wr.fast_reg.page_list_len = page_no; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; + if (fastreg_wr.wr.fast_reg.length < len) { + rc = -EIO; + goto out_err; } /* Bump the key */ - key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); - ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); - frmr_wr.wr.fast_reg.access_flags = (writing ? + fastreg_wr.wr.fast_reg.access_flags = (writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ); - frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + fastreg_wr.wr.fast_reg.rkey = mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); - + rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); if (rc) { dprintk("RPC: %s: failed ib_post_send for register," " status %i\n", __func__, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); + ib_update_fast_reg_key(mr, --key); + goto out_err; } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + seg1->mr_rkey = mr->rkey; seg1->mr_base = seg1->mr_dma + pageoff; seg1->mr_nsegs = i; seg1->mr_len = len; } *nsegs = i; + return 0; +out_err: + frmr->fr_state = FRMR_IS_INVALID; + while (i--) + rpcrdma_unmap_one(ia, --seg); return rc; } @@ -1585,20 +1804,25 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, struct ib_send_wr invalidate_wr, *bad_wr; int rc; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); + seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; memset(&invalidate_wr, 0, sizeof invalidate_wr); invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.send_flags = IB_SEND_SIGNALED; invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); + read_lock(&ia->ri_qplock); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); - if (rc) + read_unlock(&ia->ri_qplock); + if (rc) { + /* Force rpcrdma_buffer_get() to retry */ + seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE; dprintk("RPC: %s: failed ib_post_send for invalidate," " status %i\n", __func__, rc); + } return rc; } @@ -1656,8 +1880,10 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); rc = ib_unmap_fmr(&l); + read_lock(&ia->ri_qplock); while (seg1->mr_nsegs--) rpcrdma_unmap_one(ia, seg++); + read_unlock(&ia->ri_qplock); if (rc) dprintk("RPC: %s: failed ib_unmap_fmr," " status %i\n", __func__, rc); @@ -1673,7 +1899,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, switch (ia->ri_memreg_strategy) { -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: rpcrdma_map_one(ia, seg, writing); seg->mr_rkey = ia->ri_bind_mem->rkey; @@ -1681,7 +1906,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, seg->mr_nsegs = 1; nsegs = 1; break; -#endif /* Registration using frmr registration */ case RPCRDMA_FRMR: @@ -1711,11 +1935,11 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, switch (ia->ri_memreg_strategy) { -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: + read_lock(&ia->ri_qplock); rpcrdma_unmap_one(ia, seg); + read_unlock(&ia->ri_qplock); break; -#endif case RPCRDMA_FRMR: rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); @@ -1809,3 +2033,44 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, rc); return rc; } + +/* Physical mapping means one Read/Write list entry per-page. + * All list entries must fit within an inline buffer + * + * NB: The server must return a Write list for NFS READ, + * which has the same constraint. Factor in the inline + * rsize as well. + */ +static size_t +rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + unsigned int inline_size, pages; + + inline_size = min_t(unsigned int, + cdata->inline_wsize, cdata->inline_rsize); + inline_size -= RPCRDMA_HDRLEN_MIN; + pages = inline_size / sizeof(struct rpcrdma_segment); + return pages << PAGE_SHIFT; +} + +static size_t +rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt) +{ + return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; +} + +size_t +rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt) +{ + size_t result; + + switch (r_xprt->rx_ia.ri_memreg_strategy) { + case RPCRDMA_ALLPHYSICAL: + result = rpcrdma_physical_max_payload(r_xprt); + break; + default: + result = rpcrdma_mr_max_payload(r_xprt); + } + return result; +} diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 89e7cd479705..c419498b8f46 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -59,6 +59,7 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { + rwlock_t ri_qplock; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct ib_mr *ri_bind_mem; @@ -98,6 +99,14 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -137,6 +146,40 @@ struct rpcrdma_rep { }; /* + * struct rpcrdma_mw - external memory region metadata + * + * An external memory region is any buffer or page that is registered + * on the fly (ie, not pre-registered). + * + * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During + * call_allocate, rpcrdma_buffer_get() assigns one to each segment in + * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep + * track of registration metadata while each RPC is pending. + * rpcrdma_deregister_external() uses this metadata to unmap and + * release these resources when an RPC is complete. + */ +enum rpcrdma_frmr_state { + FRMR_IS_INVALID, /* ready to be used */ + FRMR_IS_VALID, /* in use */ + FRMR_IS_STALE, /* failed completion */ +}; + +struct rpcrdma_frmr { + struct ib_fast_reg_page_list *fr_pgl; + struct ib_mr *fr_mr; + enum rpcrdma_frmr_state fr_state; +}; + +struct rpcrdma_mw { + union { + struct ib_fmr *fmr; + struct rpcrdma_frmr frmr; + } r; + struct list_head mw_list; + struct list_head mw_all; +}; + +/* * struct rpcrdma_req -- structure central to the request/reply sequence. * * N of these are associated with a transport instance, and stored in @@ -163,17 +206,7 @@ struct rpcrdma_rep { struct rpcrdma_mr_seg { /* chunk descriptors */ union { /* chunk memory handles */ struct ib_mr *rl_mr; /* if registered directly */ - struct rpcrdma_mw { /* if registered from region */ - union { - struct ib_fmr *fmr; - struct { - struct ib_fast_reg_page_list *fr_pgl; - struct ib_mr *fr_mr; - enum { FRMR_IS_INVALID, FRMR_IS_VALID } state; - } frmr; - } r; - struct list_head mw_list; - } *rl_mw; + struct rpcrdma_mw *rl_mw; /* if registered from region */ } mr_chunk; u64 mr_base; /* registration result */ u32 mr_rkey; /* registration result */ @@ -191,6 +224,7 @@ struct rpcrdma_req { unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ unsigned int rl_connect_cookie; /* retry detection */ + enum rpcrdma_chunktype rl_rtype, rl_wtype; struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ @@ -214,6 +248,7 @@ struct rpcrdma_buffer { atomic_t rb_credits; /* most recent server credits */ int rb_max_requests;/* client max requests */ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ + struct list_head rb_all; int rb_send_index; struct rpcrdma_req **rb_send_bufs; int rb_recv_index; @@ -306,7 +341,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_req *); @@ -346,7 +381,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ +ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t); int rpcrdma_marshal_req(struct rpc_rqst *); +size_t rpcrdma_max_payload(struct rpcrdma_xprt *); /* Temporary NFS request map cache. Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_map_cachep; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index be8bbd5d65ec..43cd89eacfab 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -594,6 +594,7 @@ static int xs_local_send_request(struct rpc_task *task) } switch (status) { + case -ENOBUFS: case -EAGAIN: status = xs_nospace(task); break; @@ -661,6 +662,7 @@ static int xs_udp_send_request(struct rpc_task *task) dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); case -ENETUNREACH: + case -ENOBUFS: case -EPIPE: case -ECONNREFUSED: /* When the server has died, an ICMP port unreachable message @@ -758,6 +760,7 @@ static int xs_tcp_send_request(struct rpc_task *task) status = -ENOTCONN; /* Should we call xs_close() here? */ break; + case -ENOBUFS: case -EAGAIN: status = xs_nospace(task); break; @@ -1946,6 +1949,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) dprintk("RPC: xprt %p connected to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); xprt_set_connected(xprt); + case -ENOBUFS: break; case -ENOENT: dprintk("RPC: xprt %p: socket %s does not exist\n", @@ -2281,6 +2285,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -ECONNREFUSED: case -ECONNRESET: case -ENETUNREACH: + case -ENOBUFS: /* retry with existing socket, after a delay */ goto out; } @@ -3054,12 +3059,12 @@ static int param_set_uint_minmax(const char *val, const struct kernel_param *kp, unsigned int min, unsigned int max) { - unsigned long num; + unsigned int num; int ret; if (!val) return -EINVAL; - ret = strict_strtoul(val, 0, &num); + ret = kstrtouint(val, 0, &num); if (ret == -EINVAL || num < min || num > max) return -EINVAL; *((unsigned int *)kp->arg) = num; diff --git a/net/tipc/port.h b/net/tipc/port.h index 3f93454592b6..3087da39ee47 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -179,9 +179,12 @@ static inline int tipc_port_importance(struct tipc_port *port) return msg_importance(&port->phdr); } -static inline void tipc_port_set_importance(struct tipc_port *port, int imp) +static inline int tipc_port_set_importance(struct tipc_port *port, int imp) { + if (imp > TIPC_CRITICAL_IMPORTANCE) + return -EINVAL; msg_set_importance(&port->phdr, (u32)imp); + return 0; } #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7d423ee10897..ff8c8118d56e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1973,7 +1973,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - tipc_port_set_importance(port, value); + res = tipc_port_set_importance(port, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index df7b1332a1ec..7257164af91b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6969,6 +6969,9 @@ void __cfg80211_send_event_skb(struct sk_buff *skb, gfp_t gfp) struct nlattr *data = ((void **)skb->cb)[2]; enum nl80211_multicast_groups mcgrp = NL80211_MCGRP_TESTMODE; + /* clear CB data for netlink core to own from now on */ + memset(skb->cb, 0, sizeof(skb->cb)); + nla_nest_end(skb, data); genlmsg_end(skb, hdr); @@ -9294,6 +9297,9 @@ int cfg80211_vendor_cmd_reply(struct sk_buff *skb) void *hdr = ((void **)skb->cb)[1]; struct nlattr *data = ((void **)skb->cb)[2]; + /* clear CB data for netlink core to own from now on */ + memset(skb->cb, 0, sizeof(skb->cb)); + if (WARN_ON(!rdev->cur_cmd_info)) { kfree_skb(skb); return -EINVAL; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0525d78ba328..fdde51f4271a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -39,6 +39,11 @@ #define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ)) #define XFRM_MAX_QUEUE_LEN 100 +struct xfrm_flo { + struct dst_entry *dst_orig; + u8 flags; +}; + static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO] __read_mostly; @@ -389,7 +394,7 @@ redo: if (h != h0) continue; hlist_del(&pol->bydst); - hlist_add_after(entry0, &pol->bydst); + hlist_add_behind(&pol->bydst, entry0); } entry0 = &pol->bydst; } @@ -654,7 +659,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) break; } if (newpos) - hlist_add_after(newpos, &policy->bydst); + hlist_add_behind(&policy->bydst, newpos); else hlist_add_head(&policy->bydst, chain); xfrm_pol_hold(policy); @@ -1877,13 +1882,14 @@ static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) } static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, - struct dst_entry *dst, + struct xfrm_flo *xflo, const struct flowi *fl, int num_xfrms, u16 family) { int err; struct net_device *dev; + struct dst_entry *dst; struct dst_entry *dst1; struct xfrm_dst *xdst; @@ -1891,9 +1897,12 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, if (IS_ERR(xdst)) return xdst; - if (net->xfrm.sysctl_larval_drop || num_xfrms <= 0) + if (!(xflo->flags & XFRM_LOOKUP_QUEUE) || + net->xfrm.sysctl_larval_drop || + num_xfrms <= 0) return xdst; + dst = xflo->dst_orig; dst1 = &xdst->u.dst; dst_hold(dst); xdst->route = dst; @@ -1935,7 +1944,7 @@ static struct flow_cache_object * xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct flow_cache_object *oldflo, void *ctx) { - struct dst_entry *dst_orig = (struct dst_entry *)ctx; + struct xfrm_flo *xflo = (struct xfrm_flo *)ctx; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct xfrm_dst *xdst, *new_xdst; int num_pols = 0, num_xfrms = 0, i, err, pol_dead; @@ -1976,7 +1985,8 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, goto make_dummy_bundle; } - new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig); + new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, + xflo->dst_orig); if (IS_ERR(new_xdst)) { err = PTR_ERR(new_xdst); if (err != -EAGAIN) @@ -2010,7 +2020,7 @@ make_dummy_bundle: /* We found policies, but there's no bundles to instantiate: * either because the policy blocks, has no transformations or * we could not build template (no xfrm_states).*/ - xdst = xfrm_create_dummy_bundle(net, dst_orig, fl, num_xfrms, family); + xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); return ERR_CAST(xdst); @@ -2104,13 +2114,18 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, } if (xdst == NULL) { + struct xfrm_flo xflo; + + xflo.dst_orig = dst_orig; + xflo.flags = flags; + /* To accelerate a bit... */ if ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT]) goto nopol; flo = flow_cache_lookup(net, fl, family, dir, - xfrm_bundle_lookup, dst_orig); + xfrm_bundle_lookup, &xflo); if (flo == NULL) goto nopol; if (IS_ERR(flo)) { @@ -2138,7 +2153,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, xfrm_pols_put(pols, drop_pols); XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); - return make_blackhole(net, family, dst_orig); + return ERR_PTR(-EREMOTE); } err = -EAGAIN; @@ -2195,6 +2210,23 @@ dropdst: } EXPORT_SYMBOL(xfrm_lookup); +/* Callers of xfrm_lookup_route() must ensure a call to dst_output(). + * Otherwise we may send out blackholed packets. + */ +struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, + const struct flowi *fl, + struct sock *sk, int flags) +{ + struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, + flags | XFRM_LOOKUP_QUEUE); + + if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) + return make_blackhole(net, dst_orig->ops->family, dst_orig); + + return dst; +} +EXPORT_SYMBOL(xfrm_lookup_route); + static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { @@ -2460,7 +2492,7 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) skb_dst_force(skb); - dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, 0); + dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; |