Diffstat (limited to 'net')
150 files changed, 2689 insertions(+), 1435 deletions(-)
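Editor's note on the netpoll API change that threads through several hunks below (net/8021q/vlan_dev.c, net/bridge/br_device.c, net/bridge/br_if.c, net/bridge/br_private.h): the ndo_netpoll_setup hook and __netpoll_setup() gain a gfp_t argument so callers that cannot sleep can pass GFP_ATOMIC, and teardown replaces the explicit "synchronize_rcu_bh(); __netpoll_cleanup(); kfree();" sequence with __netpoll_free_rcu(), which defers the free until after an RCU grace period. A minimal sketch of the resulting pattern for a stacked device follows; the foo_* names and the foo_priv layout are hypothetical, not part of this diff:

#include <linux/netdevice.h>
#include <linux/netpoll.h>
#include <linux/slab.h>

struct foo_priv {
	struct net_device *lower_dev;	/* device we stack on top of */
	struct netpoll *np;
};

/* wired up via .ndo_netpoll_setup in this driver's net_device_ops */
static int foo_netpoll_setup(struct net_device *dev,
			     struct netpoll_info *npinfo, gfp_t gfp)
{
	struct foo_priv *priv = netdev_priv(dev);
	struct netpoll *np;
	int err;

	np = kzalloc(sizeof(*np), gfp);	/* gfp now supplied by the caller */
	if (!np)
		return -ENOMEM;

	err = __netpoll_setup(np, priv->lower_dev, gfp);
	if (err) {
		kfree(np);
		return err;
	}
	priv->np = np;
	return 0;
}

static void foo_netpoll_cleanup(struct net_device *dev)
{
	struct foo_priv *priv = netdev_priv(dev);
	struct netpoll *np = priv->np;

	if (!np)
		return;
	priv->np = NULL;
	/* free is deferred past an RCU grace period; no synchronize_rcu_bh() */
	__netpoll_free_rcu(np);
}

vlan_dev_netpoll_setup()/vlan_dev_netpoll_cleanup() and br_netpoll_enable()/br_netpoll_disable() in the hunks below follow exactly this shape.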
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 73a2a83ee2da..402442402af7 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -137,9 +137,21 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, return rc; } +static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) +{ +#ifdef CONFIG_NET_POLL_CONTROLLER + if (vlan->netpoll) + netpoll_send_skb(vlan->netpoll, skb); +#else + BUG(); +#endif + return NETDEV_TX_OK; +} + static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); unsigned int len; int ret; @@ -150,29 +162,30 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... */ if (veth->h_vlan_proto != htons(ETH_P_8021Q) || - vlan_dev_priv(dev)->flags & VLAN_FLAG_REORDER_HDR) { + vlan->flags & VLAN_FLAG_REORDER_HDR) { u16 vlan_tci; - vlan_tci = vlan_dev_priv(dev)->vlan_id; + vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb); skb = __vlan_hwaccel_put_tag(skb, vlan_tci); } - skb->dev = vlan_dev_priv(dev)->real_dev; + skb->dev = vlan->real_dev; len = skb->len; - if (netpoll_tx_running(dev)) - return skb->dev->netdev_ops->ndo_start_xmit(skb, skb->dev); + if (unlikely(netpoll_tx_running(dev))) + return vlan_netpoll_send_skb(vlan, skb); + ret = dev_queue_xmit(skb); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *stats; - stats = this_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats); + stats = this_cpu_ptr(vlan->vlan_pcpu_stats); u64_stats_update_begin(&stats->syncp); stats->tx_packets++; stats->tx_bytes += len; u64_stats_update_end(&stats->syncp); } else { - this_cpu_inc(vlan_dev_priv(dev)->vlan_pcpu_stats->tx_dropped); + this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped); } return ret; @@ -669,25 +682,26 @@ static void vlan_dev_poll_controller(struct net_device *dev) return; } -static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) +static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo, + gfp_t gfp) { - struct vlan_dev_priv *info = vlan_dev_priv(dev); - struct net_device *real_dev = info->real_dev; + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + struct net_device *real_dev = vlan->real_dev; struct netpoll *netpoll; int err = 0; - netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); + netpoll = kzalloc(sizeof(*netpoll), gfp); err = -ENOMEM; if (!netpoll) goto out; - err = __netpoll_setup(netpoll, real_dev); + err = __netpoll_setup(netpoll, real_dev, gfp); if (err) { kfree(netpoll); goto out; } - info->netpoll = netpoll; + vlan->netpoll = netpoll; out: return err; @@ -695,19 +709,15 @@ out: static void vlan_dev_netpoll_cleanup(struct net_device *dev) { - struct vlan_dev_priv *info = vlan_dev_priv(dev); - struct netpoll *netpoll = info->netpoll; + struct vlan_dev_priv *vlan= vlan_dev_priv(dev); + struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; - info->netpoll = NULL; - - /* Wait for transmitting packets to finish before freeing. 
*/ - synchronize_rcu_bh(); + vlan->netpoll = NULL; - __netpoll_cleanup(netpoll); - kfree(netpoll); + __netpoll_free_rcu(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ diff --git a/net/atm/common.c b/net/atm/common.c index b4b44dbed645..0c0ad930a632 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -812,6 +812,7 @@ int vcc_getsockopt(struct socket *sock, int level, int optname, if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags)) return -ENOTCONN; + memset(&pvc, 0, sizeof(pvc)); pvc.sap_family = AF_ATMPVC; pvc.sap_addr.itf = vcc->dev->number; pvc.sap_addr.vpi = vcc->vpi; diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 3a734919c36c..ae0324021407 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -95,6 +95,7 @@ static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr, return -ENOTCONN; *sockaddr_len = sizeof(struct sockaddr_atmpvc); addr = (struct sockaddr_atmpvc *)sockaddr; + memset(addr, 0, sizeof(*addr)); addr->sap_family = AF_ATMPVC; addr->sap_addr.itf = vcc->dev->number; addr->sap_addr.vpi = vcc->vpi; diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index a081ce1c0514..cebaae7e148b 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -20,8 +20,8 @@ #ifndef _NET_BATMAN_ADV_BITARRAY_H_ #define _NET_BATMAN_ADV_BITARRAY_H_ -/* returns true if the corresponding bit in the given seq_bits indicates true - * and curr_seqno is within range of last_seqno +/* Returns 1 if the corresponding bit in the given seq_bits indicates true + * and curr_seqno is within range of last_seqno. Otherwise returns 0. */ static inline int batadv_test_bit(const unsigned long *seq_bits, uint32_t last_seqno, uint32_t curr_seqno) @@ -32,7 +32,7 @@ static inline int batadv_test_bit(const unsigned long *seq_bits, if (diff < 0 || diff >= BATADV_TQ_LOCAL_WINDOW_SIZE) return 0; else - return test_bit(diff, seq_bits); + return test_bit(diff, seq_bits) != 0; } /* turn corresponding bit on, so we can remember that we got the packet */ diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index b421cc49d2cd..fc866f2e4528 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -200,11 +200,11 @@ void batadv_gw_election(struct batadv_priv *bat_priv) if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT) goto out; - if (!batadv_atomic_dec_not_zero(&bat_priv->gw_reselect)) - goto out; - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + if (!batadv_atomic_dec_not_zero(&bat_priv->gw_reselect) && curr_gw) + goto out; + next_gw = batadv_gw_get_best_gw_node(bat_priv); if (curr_gw == next_gw) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index a438f4b582fc..99dd8f75b3ff 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -197,6 +197,7 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv, del: list_del(&entry->list); kfree(entry); + kfree(tt_change_node); event_removed = true; goto unlock; } diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 5e5f5b410e0b..1eaacf10d19d 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -58,7 +58,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long switch (cmd) { case BNEPCONNADD: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&ca, argp, sizeof(ca))) return -EFAULT; @@ -84,7 +84,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long case BNEPCONNDEL: 
if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&cd, argp, sizeof(cd))) return -EFAULT; diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index 311668d14571..32dc83dcb6b2 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -72,7 +72,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long switch (cmd) { case CMTPCONNADD: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&ca, argp, sizeof(ca))) return -EFAULT; @@ -97,7 +97,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long case CMTPCONNDEL: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&cd, argp, sizeof(cd))) return -EFAULT; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 5ad7da217474..3c094e78dde9 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -29,6 +29,7 @@ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/a2mp.h> +#include <net/bluetooth/smp.h> static void hci_le_connect(struct hci_conn *conn) { @@ -619,6 +620,9 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) { BT_DBG("hcon %p", conn); + if (conn->type == LE_LINK) + return smp_conn_security(conn, sec_level); + /* For sdp we don't need the link key. */ if (sec_level == BT_SECURITY_SDP) return 1; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 41ff978a33f9..715d7e33fba0 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1365,6 +1365,9 @@ static bool hci_resolve_next_name(struct hci_dev *hdev) return false; e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_NEEDED); + if (!e) + return false; + if (hci_resolve_name(hdev, e) == 0) { e->name_state = NAME_PENDING; return true; @@ -1393,12 +1396,20 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn, return; e = hci_inquiry_cache_lookup_resolve(hdev, bdaddr, NAME_PENDING); - if (e) { + /* If the device was not found in a list of found devices names of which + * are pending. 
there is no need to continue resolving a next name as it + * will be done upon receiving another Remote Name Request Complete + * Event */ + if (!e) + return; + + list_del(&e->list); + if (name) { e->name_state = NAME_KNOWN; - list_del(&e->list); - if (name) - mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, - e->data.rssi, name, name_len); + mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, + e->data.rssi, name, name_len); + } else { + e->name_state = NAME_NOT_KNOWN; } if (hci_resolve_next_name(hdev)) @@ -1762,7 +1773,12 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; hci_conn_hold(conn); - conn->disc_timeout = HCI_DISCONN_TIMEOUT; + + if (!conn->out && !hci_conn_ssp_enabled(conn) && + !hci_find_link_key(hdev, &ev->bdaddr)) + conn->disc_timeout = HCI_PAIRING_TIMEOUT; + else + conn->disc_timeout = HCI_DISCONN_TIMEOUT; } else conn->state = BT_CONNECTED; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index a7f04de03d79..d5ace1eda3ed 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -490,7 +490,7 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, switch (cmd) { case HCISETRAW: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) return -EPERM; @@ -510,12 +510,12 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, case HCIBLOCKADDR: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_sock_blacklist_add(hdev, (void __user *) arg); case HCIUNBLOCKADDR: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_sock_blacklist_del(hdev, (void __user *) arg); default: @@ -546,22 +546,22 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, case HCIDEVUP: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_dev_open(arg); case HCIDEVDOWN: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_dev_close(arg); case HCIDEVRESET: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_dev_reset(arg); case HCIDEVRESTAT: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_dev_reset_stat(arg); case HCISETSCAN: @@ -573,7 +573,7 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, case HCISETACLMTU: case HCISETSCOMTU: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_dev_cmd(cmd, argp); case HCIINQUIRY: @@ -694,6 +694,7 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, *addr_len = sizeof(*haddr); haddr->hci_family = AF_BLUETOOTH; haddr->hci_dev = hdev->id; + haddr->hci_channel= 0; release_sock(sk); return 0; @@ -1009,6 +1010,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, { struct hci_filter *f = &hci_pi(sk)->filter; + memset(&uf, 0, sizeof(uf)); uf.type_mask = f->type_mask; uf.opcode = f->opcode; uf.event_mask[0] = *((u32 *) f->event_mask + 0); diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 18b3f6892a36..b24fb3bd8625 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -56,7 +56,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long switch (cmd) { case HIDPCONNADD: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&ca, argp, sizeof(ca))) return -EFAULT; @@ -91,7 +91,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long 
case HIDPCONNDEL: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&cd, argp, sizeof(cd))) return -EFAULT; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index a8964db04bfb..4ea1710a4783 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1181,6 +1181,7 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) sk = chan->sk; hci_conn_hold(conn->hcon); + conn->hcon->disc_timeout = HCI_DISCONN_TIMEOUT; bacpy(&bt_sk(sk)->src, conn->src); bacpy(&bt_sk(sk)->dst, conn->dst); @@ -1198,14 +1199,15 @@ clean: static void l2cap_conn_ready(struct l2cap_conn *conn) { struct l2cap_chan *chan; + struct hci_conn *hcon = conn->hcon; BT_DBG("conn %p", conn); - if (!conn->hcon->out && conn->hcon->type == LE_LINK) + if (!hcon->out && hcon->type == LE_LINK) l2cap_le_conn_ready(conn); - if (conn->hcon->out && conn->hcon->type == LE_LINK) - smp_conn_security(conn, conn->hcon->pending_sec_level); + if (hcon->out && hcon->type == LE_LINK) + smp_conn_security(hcon, hcon->pending_sec_level); mutex_lock(&conn->chan_lock); @@ -1218,8 +1220,8 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) continue; } - if (conn->hcon->type == LE_LINK) { - if (smp_conn_security(conn, chan->sec_level)) + if (hcon->type == LE_LINK) { + if (smp_conn_security(hcon, chan->sec_level)) l2cap_chan_ready(chan); } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a4bb27e8427e..34bbe1c5e389 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -245,6 +245,7 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *l BT_DBG("sock %p, sk %p", sock, sk); + memset(la, 0, sizeof(struct sockaddr_l2)); addr->sa_family = AF_BLUETOOTH; *len = sizeof(struct sockaddr_l2); @@ -615,7 +616,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch break; } - if (smp_conn_security(conn, sec.level)) + if (smp_conn_security(conn->hcon, sec.level)) break; sk->sk_state = BT_CONFIG; chan->state = BT_CONFIG; @@ -1174,7 +1175,7 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int p chan = l2cap_chan_create(); if (!chan) { - l2cap_sock_kill(sk); + sk_free(sk); return NULL; } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 7e1e59645c05..1a17850d093c 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -528,6 +528,7 @@ static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int * BT_DBG("sock %p, sk %p", sock, sk); + memset(sa, 0, sizeof(*sa)); sa->rc_family = AF_BLUETOOTH; sa->rc_channel = rfcomm_pi(sk)->channel; if (peer) @@ -822,6 +823,7 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c } sec.level = rfcomm_pi(sk)->sec_level; + sec.key_size = 0; len = min_t(unsigned int, len, sizeof(sec)); if (copy_to_user(optval, (char *) &sec, len)) diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index cb960773c002..56f182393c4c 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -456,7 +456,7 @@ static int rfcomm_get_dev_list(void __user *arg) size = sizeof(*dl) + dev_num * sizeof(*di); - dl = kmalloc(size, GFP_KERNEL); + dl = kzalloc(size, GFP_KERNEL); if (!dl) return -ENOMEM; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 40bbe25dcff7..3589e21edb09 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -131,6 +131,15 @@ static int 
sco_conn_del(struct hci_conn *hcon, int err) sco_sock_clear_timer(sk); sco_chan_del(sk, err); bh_unlock_sock(sk); + + sco_conn_lock(conn); + conn->sk = NULL; + sco_pi(sk)->conn = NULL; + sco_conn_unlock(conn); + + if (conn->hcon) + hci_conn_put(conn->hcon); + sco_sock_kill(sk); } @@ -821,16 +830,6 @@ static void sco_chan_del(struct sock *sk, int err) BT_DBG("sk %p, conn %p, err %d", sk, conn, err); - if (conn) { - sco_conn_lock(conn); - conn->sk = NULL; - sco_pi(sk)->conn = NULL; - sco_conn_unlock(conn); - - if (conn->hcon) - hci_conn_put(conn->hcon); - } - sk->sk_state = BT_CLOSED; sk->sk_err = err; sk->sk_state_change(sk); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 16ef0dc85a0a..8c225ef349cd 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -267,10 +267,10 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason, u8 send) mgmt_auth_failed(conn->hcon->hdev, conn->dst, hcon->type, hcon->dst_type, reason); - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { - cancel_delayed_work_sync(&conn->security_timer); + cancel_delayed_work_sync(&conn->security_timer); + + if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) smp_chan_destroy(conn); - } } #define JUST_WORKS 0x00 @@ -579,8 +579,11 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) smp = smp_chan_create(conn); + else + smp = conn->smp_chan; - smp = conn->smp_chan; + if (!smp) + return SMP_UNSPECIFIED; smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], req, sizeof(*req)); @@ -757,9 +760,9 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } -int smp_conn_security(struct l2cap_conn *conn, __u8 sec_level) +int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { - struct hci_conn *hcon = conn->hcon; + struct l2cap_conn *conn = hcon->l2cap_data; struct smp_chan *smp = conn->smp_chan; __u8 authreq; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 333484537600..070e8a68cfc6 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -31,9 +31,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct net_bridge_mdb_entry *mdst; struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats); + rcu_read_lock(); #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { br_nf_pre_routing_finish_bridge_slow(skb); + rcu_read_unlock(); return NETDEV_TX_OK; } #endif @@ -48,7 +50,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); - rcu_read_lock(); if (is_broadcast_ether_addr(dest)) br_flood_deliver(br, skb); else if (is_multicast_ether_addr(dest)) { @@ -206,24 +207,23 @@ static void br_poll_controller(struct net_device *br_dev) static void br_netpoll_cleanup(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - struct net_bridge_port *p, *n; + struct net_bridge_port *p; - list_for_each_entry_safe(p, n, &br->port_list, list) { + list_for_each_entry(p, &br->port_list, list) br_netpoll_disable(p); - } } -static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, + gfp_t gfp) { struct net_bridge *br = netdev_priv(dev); - struct net_bridge_port *p, *n; + struct net_bridge_port *p; int err = 0; - list_for_each_entry_safe(p, n, &br->port_list, list) { + list_for_each_entry(p, 
&br->port_list, list) { if (!p->dev) continue; - - err = br_netpoll_enable(p); + err = br_netpoll_enable(p, gfp); if (err) goto fail; } @@ -236,17 +236,17 @@ fail: goto out; } -int br_netpoll_enable(struct net_bridge_port *p) +int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) { struct netpoll *np; int err = 0; - np = kzalloc(sizeof(*p->np), GFP_KERNEL); + np = kzalloc(sizeof(*p->np), gfp); err = -ENOMEM; if (!np) goto out; - err = __netpoll_setup(np, p->dev); + err = __netpoll_setup(np, p->dev, gfp); if (err) { kfree(np); goto out; @@ -267,11 +267,7 @@ void br_netpoll_disable(struct net_bridge_port *p) p->np = NULL; - /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu_bh(); - - __netpoll_cleanup(np); - kfree(np); + __netpoll_free_rcu(np); } #endif diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index e9466d412707..02015a505d2a 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -65,7 +65,7 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { skb->dev = to->dev; - if (unlikely(netpoll_tx_running(to->dev))) { + if (unlikely(netpoll_tx_running(to->br->dev))) { if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) kfree_skb(skb); else { diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index e1144e1617be..1c8fdc3558cd 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -361,7 +361,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err2; - if (br_netpoll_info(br) && ((err = br_netpoll_enable(p)))) + if (br_netpoll_info(br) && ((err = br_netpoll_enable(p, GFP_KERNEL)))) goto err3; err = netdev_set_master(dev, br->dev); @@ -427,6 +427,10 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) if (!p || p->br != br) return -EINVAL; + /* Since more than one interface can be attached to a bridge, + * there still maybe an alternate path for netconsole to use; + * therefore there is no reason for a NETDEV_RELEASE event. 
+ */ del_nbp(p); spin_lock_bh(&br->lock); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a768b2408edf..f507d2af9646 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -316,7 +316,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, netpoll_send_skb(np, skb); } -extern int br_netpoll_enable(struct net_bridge_port *p); +extern int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp); extern void br_netpoll_disable(struct net_bridge_port *p); #else static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br) @@ -329,7 +329,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, { } -static inline int br_netpoll_enable(struct net_bridge_port *p) +static inline int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) { return 0; } diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 6229b62749e8..13b36bdc76a7 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -27,7 +27,7 @@ struct brport_attribute { }; #define BRPORT_ATTR(_name,_mode,_show,_store) \ -struct brport_attribute brport_attr_##_name = { \ +const struct brport_attribute brport_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = _mode }, \ .show = _show, \ @@ -164,7 +164,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router, store_multicast_router); #endif -static struct brport_attribute *brport_attrs[] = { +static const struct brport_attribute *brport_attrs[] = { &brport_attr_path_cost, &brport_attr_priority, &brport_attr_port_id, @@ -241,7 +241,7 @@ const struct sysfs_ops brport_sysfs_ops = { int br_sysfs_addif(struct net_bridge_port *p) { struct net_bridge *br = p->br; - struct brport_attribute **a; + const struct brport_attribute **a; int err; err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj, diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index f88ee537fb2b..92de5e5f9db2 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -80,7 +80,7 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum, unsigned int bitmask; spin_lock_bh(&ebt_log_lock); - printk("<%c>%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x", + printk(KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x", '0' + loginfo->u.log.level, prefix, in ? in->name : "", out ? 
out->name : "", eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 78f1cdad5b33..095259f83902 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -141,7 +141,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) err = sk_filter(sk, skb); if (err) return err; - if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) { + if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) { set_rx_flow_off(cf_sk); net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c index dd485f6128e8..ba217e90765e 100644 --- a/net/caif/cfsrvl.c +++ b/net/caif/cfsrvl.c @@ -211,9 +211,10 @@ void caif_client_register_refcnt(struct cflayer *adapt_layer, void (*put)(struct cflayer *lyr)) { struct cfsrvl *service; - service = container_of(adapt_layer->dn, struct cfsrvl, layer); - WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL); + if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL)) + return; + service = container_of(adapt_layer->dn, struct cfsrvl, layer); service->hold = hold; service->put = put; } diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 69771c04ba8f..e597733affb8 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -94,6 +94,10 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) /* check the version of IP */ ip_version = skb_header_pointer(skb, 0, 1, &buf); + if (!ip_version) { + kfree_skb(skb); + return -EINVAL; + } switch (*ip_version >> 4) { case 4: diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index ba4323bce0e9..a8020293f342 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -17,6 +17,7 @@ #include <linux/string.h> +#include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/debugfs.h> #include <linux/ceph/decode.h> @@ -83,7 +84,6 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) return -1; } } else { - pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); memcpy(&client->fsid, fsid, sizeof(*fsid)); } return 0; @@ -460,27 +460,23 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, client->auth_err = 0; client->extra_mon_dispatch = NULL; - client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT | + client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT | supported_features; - client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT | + client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT | required_features; /* msgr */ if (ceph_test_opt(client, MYIP)) myaddr = &client->options->my_addr; - client->msgr = ceph_messenger_create(myaddr, - client->supported_features, - client->required_features); - if (IS_ERR(client->msgr)) { - err = PTR_ERR(client->msgr); - goto fail; - } - client->msgr->nocrc = ceph_test_opt(client, NOCRC); + ceph_messenger_init(&client->msgr, myaddr, + client->supported_features, + client->required_features, + ceph_test_opt(client, NOCRC)); /* subsystems */ err = ceph_monc_init(&client->monc, client); if (err < 0) - goto fail_msgr; + goto fail; err = ceph_osdc_init(&client->osdc, client); if (err < 0) goto fail_monc; @@ -489,8 +485,6 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, fail_monc: ceph_monc_stop(&client->monc); -fail_msgr: - ceph_messenger_destroy(client->msgr); fail: kfree(client); return ERR_PTR(err); @@ -501,6 +495,8 @@ void 
ceph_destroy_client(struct ceph_client *client) { dout("destroy_client %p\n", client); + atomic_set(&client->msgr.stopping, 1); + /* unmount */ ceph_osdc_stop(&client->osdc); @@ -508,8 +504,6 @@ void ceph_destroy_client(struct ceph_client *client) ceph_debugfs_client_cleanup(client); - ceph_messenger_destroy(client->msgr); - ceph_destroy_options(client->options); kfree(client); diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index d7edc24333b8..35fce755ce10 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map, int item = 0; int itemtype; int collide, reject; - const unsigned int orig_tries = 5; /* attempts before we fall back to search */ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", bucket->id, x, outpos, numrep); @@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map, reject = 1; goto reject; } - if (flocal >= (in->size>>1) && - flocal > orig_tries) + if (map->choose_local_fallback_tries > 0 && + flocal >= (in->size>>1) && + flocal > map->choose_local_fallback_tries) item = bucket_perm_choose(in, x, r); else item = crush_bucket_choose(in, x, r); @@ -422,13 +422,14 @@ reject: ftotal++; flocal++; - if (collide && flocal < 3) + if (collide && flocal <= map->choose_local_tries) /* retry locally a few times */ retry_bucket = 1; - else if (flocal <= in->size + orig_tries) + else if (map->choose_local_fallback_tries > 0 && + flocal <= in->size + map->choose_local_fallback_tries) /* exhaustive bucket search */ retry_bucket = 1; - else if (ftotal < 20) + else if (ftotal <= map->choose_total_tries) /* then retry descent */ retry_descent = 1; else diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index b780cb7947dd..9da7fdd3cd8a 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -466,6 +466,7 @@ void ceph_key_destroy(struct key *key) { struct ceph_crypto_key *ckey = key->payload.data; ceph_crypto_key_destroy(ckey); + kfree(ckey); } struct key_type key_type_ceph = { diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 1919d1550d75..3572dc518bc9 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -16,7 +16,8 @@ struct ceph_crypto_key { static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) { - kfree(key->key); + if (key) + kfree(key->key); } extern int ceph_crypto_key_clone(struct ceph_crypto_key *dst, diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 54b531a01121..38b5dc1823d4 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -189,6 +189,9 @@ int ceph_debugfs_client_init(struct ceph_client *client) snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, client->monc.auth->global_id); + dout("ceph_debugfs_client_init %p %s\n", client, name); + + BUG_ON(client->debugfs_dir); client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); if (!client->debugfs_dir) goto out; @@ -234,6 +237,7 @@ out: void ceph_debugfs_client_cleanup(struct ceph_client *client) { + dout("ceph_debugfs_client_cleanup %p\n", client); debugfs_remove(client->debugfs_osdmap); debugfs_remove(client->debugfs_monmap); debugfs_remove(client->osdc.debugfs_file); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 10255e81be79..159aa8bef9e7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -29,6 +29,74 @@ * the sender. */ +/* + * We track the state of the socket on a given connection using + * values defined below. 
The transition to a new socket state is + * handled by a function which verifies we aren't coming from an + * unexpected state. + * + * -------- + * | NEW* | transient initial state + * -------- + * | con_sock_state_init() + * v + * ---------- + * | CLOSED | initialized, but no socket (and no + * ---------- TCP connection) + * ^ \ + * | \ con_sock_state_connecting() + * | ---------------------- + * | \ + * + con_sock_state_closed() \ + * |+--------------------------- \ + * | \ \ \ + * | ----------- \ \ + * | | CLOSING | socket event; \ \ + * | ----------- await close \ \ + * | ^ \ | + * | | \ | + * | + con_sock_state_closing() \ | + * | / \ | | + * | / --------------- | | + * | / \ v v + * | / -------------- + * | / -----------------| CONNECTING | socket created, TCP + * | | / -------------- connect initiated + * | | | con_sock_state_connected() + * | | v + * ------------- + * | CONNECTED | TCP connection established + * ------------- + * + * State values for ceph_connection->sock_state; NEW is assumed to be 0. + */ + +#define CON_SOCK_STATE_NEW 0 /* -> CLOSED */ +#define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */ +#define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */ +#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ +#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ + +/* + * connection states + */ +#define CON_STATE_CLOSED 1 /* -> PREOPEN */ +#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ +#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ +#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ +#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ +#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ + +/* + * ceph_connection flag bits + */ +#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop + * messages on errors */ +#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ +#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ +#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ +#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ + /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; @@ -147,72 +215,130 @@ void ceph_msgr_flush(void) } EXPORT_SYMBOL(ceph_msgr_flush); +/* Connection socket state transition functions */ + +static void con_sock_state_init(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); + if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSED); +} + +static void con_sock_state_connecting(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); + if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CONNECTING); +} + +static void con_sock_state_connected(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CONNECTED); +} + +static void con_sock_state_closing(struct ceph_connection *con) +{ + int old_state; + + 
old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && + old_state != CON_SOCK_STATE_CONNECTED && + old_state != CON_SOCK_STATE_CLOSING)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSING); +} + +static void con_sock_state_closed(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && + old_state != CON_SOCK_STATE_CLOSING && + old_state != CON_SOCK_STATE_CONNECTING && + old_state != CON_SOCK_STATE_CLOSED)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSED); +} /* * socket callback functions */ /* data available on socket, or listen socket received a connect */ -static void ceph_data_ready(struct sock *sk, int count_unused) +static void ceph_sock_data_ready(struct sock *sk, int count_unused) { struct ceph_connection *con = sk->sk_user_data; + if (atomic_read(&con->msgr->stopping)) { + return; + } if (sk->sk_state != TCP_CLOSE_WAIT) { - dout("ceph_data_ready on %p state = %lu, queueing work\n", + dout("%s on %p state = %lu, queueing work\n", __func__, con, con->state); queue_con(con); } } /* socket has buffer space for writing */ -static void ceph_write_space(struct sock *sk) +static void ceph_sock_write_space(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; /* only queue to workqueue if there is data we want to write, * and there is sufficient space in the socket buffer to accept - * more data. clear SOCK_NOSPACE so that ceph_write_space() + * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() * doesn't get called again until try_write() fills the socket * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). 
*/ - if (test_bit(WRITE_PENDING, &con->state)) { + if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) { if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { - dout("ceph_write_space %p queueing write work\n", con); + dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); queue_con(con); } } else { - dout("ceph_write_space %p nothing to write\n", con); + dout("%s %p nothing to write\n", __func__, con); } } /* socket's state has changed */ -static void ceph_state_change(struct sock *sk) +static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; - dout("ceph_state_change %p state = %lu sk_state = %u\n", + dout("%s %p state = %lu sk_state = %u\n", __func__, con, con->state, sk->sk_state); - if (test_bit(CLOSED, &con->state)) - return; - switch (sk->sk_state) { case TCP_CLOSE: - dout("ceph_state_change TCP_CLOSE\n"); + dout("%s TCP_CLOSE\n", __func__); case TCP_CLOSE_WAIT: - dout("ceph_state_change TCP_CLOSE_WAIT\n"); - if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { - if (test_bit(CONNECTING, &con->state)) - con->error_msg = "connection failed"; - else - con->error_msg = "socket closed"; - queue_con(con); - } + dout("%s TCP_CLOSE_WAIT\n", __func__); + con_sock_state_closing(con); + set_bit(CON_FLAG_SOCK_CLOSED, &con->flags); + queue_con(con); break; case TCP_ESTABLISHED: - dout("ceph_state_change TCP_ESTABLISHED\n"); + dout("%s TCP_ESTABLISHED\n", __func__); + con_sock_state_connected(con); queue_con(con); break; default: /* Everything else is uninteresting */ @@ -228,9 +354,9 @@ static void set_sock_callbacks(struct socket *sock, { struct sock *sk = sock->sk; sk->sk_user_data = con; - sk->sk_data_ready = ceph_data_ready; - sk->sk_write_space = ceph_write_space; - sk->sk_state_change = ceph_state_change; + sk->sk_data_ready = ceph_sock_data_ready; + sk->sk_write_space = ceph_sock_write_space; + sk->sk_state_change = ceph_sock_state_change; } @@ -262,6 +388,7 @@ static int ceph_tcp_connect(struct ceph_connection *con) dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); + con_sock_state_connecting(con); ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), O_NONBLOCK); if (ret == -EINPROGRESS) { @@ -277,7 +404,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } con->sock = sock; - return 0; } @@ -333,16 +459,24 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, */ static int con_close_socket(struct ceph_connection *con) { - int rc; + int rc = 0; dout("con_close_socket on %p sock %p\n", con, con->sock); - if (!con->sock) - return 0; - set_bit(SOCK_CLOSED, &con->state); - rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); - sock_release(con->sock); - con->sock = NULL; - clear_bit(SOCK_CLOSED, &con->state); + if (con->sock) { + rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); + sock_release(con->sock); + con->sock = NULL; + } + + /* + * Forcibly clear the SOCK_CLOSED flag. It gets set + * independent of the connection mutex, and we could have + * received a socket close event before we had the chance to + * shut the socket down. 
+ */ + clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags); + + con_sock_state_closed(con); return rc; } @@ -353,6 +487,10 @@ static int con_close_socket(struct ceph_connection *con) static void ceph_msg_remove(struct ceph_msg *msg) { list_del_init(&msg->list_head); + BUG_ON(msg->con == NULL); + msg->con->ops->put(msg->con); + msg->con = NULL; + ceph_msg_put(msg); } static void ceph_msg_remove_list(struct list_head *head) @@ -372,8 +510,11 @@ static void reset_connection(struct ceph_connection *con) ceph_msg_remove_list(&con->out_sent); if (con->in_msg) { + BUG_ON(con->in_msg->con != con); + con->in_msg->con = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; + con->ops->put(con); } con->connect_seq = 0; @@ -391,32 +532,44 @@ static void reset_connection(struct ceph_connection *con) */ void ceph_con_close(struct ceph_connection *con) { + mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr.in_addr)); - set_bit(CLOSED, &con->state); /* in case there's queued work */ - clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ - clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ - clear_bit(KEEPALIVE_PENDING, &con->state); - clear_bit(WRITE_PENDING, &con->state); - mutex_lock(&con->mutex); + con->state = CON_STATE_CLOSED; + + clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */ + clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); + clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); + clear_bit(CON_FLAG_BACKOFF, &con->flags); + reset_connection(con); con->peer_global_seq = 0; cancel_delayed_work(&con->work); + con_close_socket(con); mutex_unlock(&con->mutex); - queue_con(con); } EXPORT_SYMBOL(ceph_con_close); /* * Reopen a closed connection, with a new peer address. */ -void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) +void ceph_con_open(struct ceph_connection *con, + __u8 entity_type, __u64 entity_num, + struct ceph_entity_addr *addr) { + mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); - set_bit(OPENING, &con->state); - clear_bit(CLOSED, &con->state); + + BUG_ON(con->state != CON_STATE_CLOSED); + con->state = CON_STATE_PREOPEN; + + con->peer_name.type = (__u8) entity_type; + con->peer_name.num = cpu_to_le64(entity_num); + memcpy(&con->peer_addr, addr, sizeof(*addr)); con->delay = 0; /* reset backoff memory */ + mutex_unlock(&con->mutex); queue_con(con); } EXPORT_SYMBOL(ceph_con_open); @@ -430,42 +583,26 @@ bool ceph_con_opened(struct ceph_connection *con) } /* - * generic get/put - */ -struct ceph_connection *ceph_con_get(struct ceph_connection *con) -{ - int nref = __atomic_add_unless(&con->nref, 1, 0); - - dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1); - - return nref ? con : NULL; -} - -void ceph_con_put(struct ceph_connection *con) -{ - int nref = atomic_dec_return(&con->nref); - - BUG_ON(nref < 0); - if (nref == 0) { - BUG_ON(con->sock); - kfree(con); - } - dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref); -} - -/* * initialize a new connection. 
*/ -void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) +void ceph_con_init(struct ceph_connection *con, void *private, + const struct ceph_connection_operations *ops, + struct ceph_messenger *msgr) { dout("con_init %p\n", con); memset(con, 0, sizeof(*con)); - atomic_set(&con->nref, 1); + con->private = private; + con->ops = ops; con->msgr = msgr; + + con_sock_state_init(con); + mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, con_work); + + con->state = CON_STATE_CLOSED; } EXPORT_SYMBOL(ceph_con_init); @@ -486,14 +623,14 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) return ret; } -static void ceph_con_out_kvec_reset(struct ceph_connection *con) +static void con_out_kvec_reset(struct ceph_connection *con) { con->out_kvec_left = 0; con->out_kvec_bytes = 0; con->out_kvec_cur = &con->out_kvec[0]; } -static void ceph_con_out_kvec_add(struct ceph_connection *con, +static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { int index; @@ -507,6 +644,53 @@ static void ceph_con_out_kvec_add(struct ceph_connection *con, con->out_kvec_bytes += size; } +#ifdef CONFIG_BLOCK +static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +{ + if (!bio) { + *iter = NULL; + *seg = 0; + return; + } + *iter = bio; + *seg = bio->bi_idx; +} + +static void iter_bio_next(struct bio **bio_iter, int *seg) +{ + if (*bio_iter == NULL) + return; + + BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + + (*seg)++; + if (*seg == (*bio_iter)->bi_vcnt) + init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); +} +#endif + +static void prepare_write_message_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + BUG_ON(!msg); + BUG_ON(!msg->hdr.data_len); + + /* initialize page iterator */ + con->out_msg_pos.page = 0; + if (msg->pages) + con->out_msg_pos.page_pos = msg->page_alignment; + else + con->out_msg_pos.page_pos = 0; +#ifdef CONFIG_BLOCK + if (msg->bio) + init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); +#endif + con->out_msg_pos.data_pos = 0; + con->out_msg_pos.did_page_crc = false; + con->out_more = 1; /* data + footer will follow */ +} + /* * Prepare footer for currently outgoing message, and finish things * off. Assumes out_kvec* are already valid.. we just add on to the end. @@ -516,6 +700,8 @@ static void prepare_write_message_footer(struct ceph_connection *con) struct ceph_msg *m = con->out_msg; int v = con->out_kvec_left; + m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; + dout("prepare_write_message_footer %p\n", con); con->out_kvec_is_msg = true; con->out_kvec[v].iov_base = &m->footer; @@ -534,7 +720,7 @@ static void prepare_write_message(struct ceph_connection *con) struct ceph_msg *m; u32 crc; - ceph_con_out_kvec_reset(con); + con_out_kvec_reset(con); con->out_kvec_is_msg = true; con->out_msg_done = false; @@ -542,14 +728,16 @@ static void prepare_write_message(struct ceph_connection *con) * TCP packet that's a good thing. 
*/ if (con->in_seq > con->in_seq_acked) { con->in_seq_acked = con->in_seq; - ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), + con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); } + BUG_ON(list_empty(&con->out_queue)); m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); con->out_msg = m; + BUG_ON(m->con != con); /* put message on sent list */ ceph_msg_get(m); @@ -576,18 +764,18 @@ static void prepare_write_message(struct ceph_connection *con) BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ - ceph_con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - ceph_con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); - ceph_con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) - ceph_con_out_kvec_add(con, m->middle->vec.iov_len, + con_out_kvec_add(con, m->middle->vec.iov_len, m->middle->vec.iov_base); /* fill in crc (except data pages), footer */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; + con->out_msg->footer.flags = 0; crc = crc32c(0, m->front.iov_base, m->front.iov_len); con->out_msg->footer.front_crc = cpu_to_le32(crc); @@ -597,28 +785,19 @@ static void prepare_write_message(struct ceph_connection *con) con->out_msg->footer.middle_crc = cpu_to_le32(crc); } else con->out_msg->footer.middle_crc = 0; - con->out_msg->footer.data_crc = 0; - dout("prepare_write_message front_crc %u data_crc %u\n", + dout("%s front_crc %u middle_crc %u\n", __func__, le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.middle_crc)); /* is there a data payload? */ - if (le32_to_cpu(m->hdr.data_len) > 0) { - /* initialize page iterator */ - con->out_msg_pos.page = 0; - if (m->pages) - con->out_msg_pos.page_pos = m->page_alignment; - else - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = false; - con->out_more = 1; /* data + footer will follow */ - } else { + con->out_msg->footer.data_crc = 0; + if (m->hdr.data_len) + prepare_write_message_data(con); + else /* no, queue up footer too and be done */ prepare_write_message_footer(con); - } - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* @@ -630,16 +809,16 @@ static void prepare_write_ack(struct ceph_connection *con) con->in_seq_acked, con->in_seq); con->in_seq_acked = con->in_seq; - ceph_con_out_kvec_reset(con); + con_out_kvec_reset(con); - ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), + con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); con->out_more = 1; /* more will follow.. eventually.. 
*/ - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* @@ -648,9 +827,9 @@ static void prepare_write_ack(struct ceph_connection *con) static void prepare_write_keepalive(struct ceph_connection *con) { dout("prepare_write_keepalive %p\n", con); - ceph_con_out_kvec_reset(con); - ceph_con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); - set_bit(WRITE_PENDING, &con->state); + con_out_kvec_reset(con); + con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* @@ -665,27 +844,21 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection if (!con->ops->get_authorizer) { con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; con->out_connect.authorizer_len = 0; - return NULL; } /* Can't hold the mutex while getting authorizer */ - mutex_unlock(&con->mutex); - auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); - mutex_lock(&con->mutex); if (IS_ERR(auth)) return auth; - if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state)) + if (con->state != CON_STATE_NEGOTIATING) return ERR_PTR(-EAGAIN); con->auth_reply_buf = auth->authorizer_reply_buf; con->auth_reply_buf_len = auth->authorizer_reply_buf_len; - - return auth; } @@ -694,12 +867,12 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection */ static void prepare_write_banner(struct ceph_connection *con) { - ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); - ceph_con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), + con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), &con->msgr->my_enc_addr); con->out_more = 0; - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } static int prepare_write_connect(struct ceph_connection *con) @@ -742,14 +915,14 @@ static int prepare_write_connect(struct ceph_connection *con) con->out_connect.authorizer_len = auth ? cpu_to_le32(auth->authorizer_buf_len) : 0; - ceph_con_out_kvec_add(con, sizeof (con->out_connect), + con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); if (auth && auth->authorizer_buf_len) - ceph_con_out_kvec_add(con, auth->authorizer_buf_len, + con_out_kvec_add(con, auth->authorizer_buf_len, auth->authorizer_buf); con->out_more = 0; - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); return 0; } @@ -797,30 +970,34 @@ out: return ret; /* done! 
*/ } -#ifdef CONFIG_BLOCK -static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +static void out_msg_pos_next(struct ceph_connection *con, struct page *page, + size_t len, size_t sent, bool in_trail) { - if (!bio) { - *iter = NULL; - *seg = 0; - return; - } - *iter = bio; - *seg = bio->bi_idx; -} + struct ceph_msg *msg = con->out_msg; -static void iter_bio_next(struct bio **bio_iter, int *seg) -{ - if (*bio_iter == NULL) - return; + BUG_ON(!msg); + BUG_ON(!sent); - BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + con->out_msg_pos.data_pos += sent; + con->out_msg_pos.page_pos += sent; + if (sent < len) + return; - (*seg)++; - if (*seg == (*bio_iter)->bi_vcnt) - init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); -} + BUG_ON(sent != len); + con->out_msg_pos.page_pos = 0; + con->out_msg_pos.page++; + con->out_msg_pos.did_page_crc = false; + if (in_trail) + list_move_tail(&page->lru, + &msg->trail->head); + else if (msg->pagelist) + list_move_tail(&page->lru, + &msg->pagelist->head); +#ifdef CONFIG_BLOCK + else if (msg->bio) + iter_bio_next(&msg->bio_iter, &msg->bio_seg); #endif +} /* * Write as much message data payload as we can. If we finish, queue @@ -837,41 +1014,36 @@ static int write_partial_msg_pages(struct ceph_connection *con) bool do_datacrc = !con->msgr->nocrc; int ret; int total_max_write; - int in_trail = 0; - size_t trail_len = (msg->trail ? msg->trail->length : 0); + bool in_trail = false; + const size_t trail_len = (msg->trail ? msg->trail->length : 0); + const size_t trail_off = data_len - trail_len; dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", - con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, + con, msg, con->out_msg_pos.page, msg->nr_pages, con->out_msg_pos.page_pos); -#ifdef CONFIG_BLOCK - if (msg->bio && !msg->bio_iter) - init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); -#endif - + /* + * Iterate through each page that contains data to be + * written, and send as much as possible for each. + * + * If we are calculating the data crc (the default), we will + * need to map the page. If we have no pages, they have + * been revoked, so use the zero page. + */ while (data_len > con->out_msg_pos.data_pos) { struct page *page = NULL; int max_write = PAGE_SIZE; int bio_offset = 0; - total_max_write = data_len - trail_len - - con->out_msg_pos.data_pos; - - /* - * if we are calculating the data crc (the default), we need - * to map the page. if our pages[] has been revoked, use the - * zero page. - */ - - /* have we reached the trail part of the data? 
*/ - if (con->out_msg_pos.data_pos >= data_len - trail_len) { - in_trail = 1; + in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; + if (!in_trail) + total_max_write = trail_off - con->out_msg_pos.data_pos; + if (in_trail) { total_max_write = data_len - con->out_msg_pos.data_pos; page = list_first_entry(&msg->trail->head, struct page, lru); - max_write = PAGE_SIZE; } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; } else if (msg->pagelist) { @@ -894,52 +1066,32 @@ static int write_partial_msg_pages(struct ceph_connection *con) if (do_datacrc && !con->out_msg_pos.did_page_crc) { void *base; - u32 crc; - u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); + u32 crc = le32_to_cpu(msg->footer.data_crc); char *kaddr; kaddr = kmap(page); BUG_ON(kaddr == NULL); base = kaddr + con->out_msg_pos.page_pos + bio_offset; - crc = crc32c(tmpcrc, base, len); - con->out_msg->footer.data_crc = cpu_to_le32(crc); + crc = crc32c(crc, base, len); + kunmap(page); + msg->footer.data_crc = cpu_to_le32(crc); con->out_msg_pos.did_page_crc = true; } ret = ceph_tcp_sendpage(con->sock, page, con->out_msg_pos.page_pos + bio_offset, len, 1); - - if (do_datacrc) - kunmap(page); - if (ret <= 0) goto out; - con->out_msg_pos.data_pos += ret; - con->out_msg_pos.page_pos += ret; - if (ret == len) { - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = false; - if (in_trail) - list_move_tail(&page->lru, - &msg->trail->head); - else if (msg->pagelist) - list_move_tail(&page->lru, - &msg->pagelist->head); -#ifdef CONFIG_BLOCK - else if (msg->bio) - iter_bio_next(&msg->bio_iter, &msg->bio_seg); -#endif - } + out_msg_pos_next(con, page, len, (size_t) ret, in_trail); } dout("write_partial_msg_pages %p msg %p done\n", con, msg); /* prepare and queue up footer, too */ if (!do_datacrc) - con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - ceph_con_out_kvec_reset(con); + msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con_out_kvec_reset(con); prepare_write_message_footer(con); ret = 1; out: @@ -1351,20 +1503,14 @@ static int process_banner(struct ceph_connection *con) ceph_pr_addr(&con->msgr->inst.addr.in_addr)); } - set_bit(NEGOTIATING, &con->state); - prepare_read_connect(con); return 0; } static void fail_protocol(struct ceph_connection *con) { reset_connection(con); - set_bit(CLOSED, &con->state); /* in case there's queued work */ - - mutex_unlock(&con->mutex); - if (con->ops->bad_proto) - con->ops->bad_proto(con); - mutex_lock(&con->mutex); + BUG_ON(con->state != CON_STATE_NEGOTIATING); + con->state = CON_STATE_CLOSED; } static int process_connect(struct ceph_connection *con) @@ -1407,7 +1553,7 @@ static int process_connect(struct ceph_connection *con) return -1; } con->auth_retry = 1; - ceph_con_out_kvec_reset(con); + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1428,7 +1574,7 @@ static int process_connect(struct ceph_connection *con) ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); - ceph_con_out_kvec_reset(con); + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1440,8 +1586,7 @@ static int process_connect(struct ceph_connection *con) if (con->ops->peer_reset) con->ops->peer_reset(con); mutex_lock(&con->mutex); - if (test_bit(CLOSED, &con->state) || - test_bit(OPENING, &con->state)) + if (con->state != CON_STATE_NEGOTIATING) return -EAGAIN; break; @@ -1454,7 +1599,7 @@ static int process_connect(struct ceph_connection *con) 
le32_to_cpu(con->out_connect.connect_seq), le32_to_cpu(con->in_reply.connect_seq)); con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); - ceph_con_out_kvec_reset(con); + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1471,7 +1616,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.global_seq)); get_global_seq(con->msgr, le32_to_cpu(con->in_reply.global_seq)); - ceph_con_out_kvec_reset(con); + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1489,7 +1634,10 @@ static int process_connect(struct ceph_connection *con) fail_protocol(con); return -1; } - clear_bit(CONNECTING, &con->state); + + BUG_ON(con->state != CON_STATE_NEGOTIATING); + con->state = CON_STATE_OPEN; + con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; con->peer_features = server_feat; @@ -1501,7 +1649,9 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.connect_seq)); if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - set_bit(LOSSYTX, &con->state); + set_bit(CON_FLAG_LOSSYTX, &con->flags); + + con->delay = 0; /* reset backoff memory */ prepare_read_tag(con); break; @@ -1587,10 +1737,7 @@ static int read_partial_message_section(struct ceph_connection *con, return 1; } -static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip); - +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); static int read_partial_message_pages(struct ceph_connection *con, struct page **pages, @@ -1633,9 +1780,6 @@ static int read_partial_message_bio(struct ceph_connection *con, void *p; int ret, left; - if (IS_ERR(bv)) - return PTR_ERR(bv); - left = min((int)(data_len - con->in_msg_pos.data_pos), (int)(bv->bv_len - con->in_msg_pos.page_pos)); @@ -1672,7 +1816,6 @@ static int read_partial_message(struct ceph_connection *con) int ret; unsigned int front_len, middle_len, data_len; bool do_datacrc = !con->msgr->nocrc; - int skip; u64 seq; u32 crc; @@ -1723,10 +1866,13 @@ static int read_partial_message(struct ceph_connection *con) /* allocate message? 
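The recurring edit in these hunks belongs to a larger cleanup: the old con->state word mixed mutually exclusive lifecycle bits (CONNECTING, NEGOTIATING, CLOSED, ...) with independent condition bits (LOSSYTX, WRITE_PENDING, ...). The patch splits them into one exclusive state value, changed by plain assignment under con->mutex, plus a separate flags word driven by atomic bitops. A sketch of the split (the bit numbers here are illustrative, not the exact ceph definitions):

	/* Exactly one of these holds at a time; assignment replaces bit games. */
	enum con_state {
		CON_STATE_CLOSED,
		CON_STATE_PREOPEN,
		CON_STATE_CONNECTING,
		CON_STATE_NEGOTIATING,
		CON_STATE_OPEN,
		CON_STATE_STANDBY,
	};

	/* Orthogonal conditions that may hold in any state. */
	#define CON_FLAG_LOSSYTX		0
	#define CON_FLAG_KEEPALIVE_PENDING	1
	#define CON_FLAG_WRITE_PENDING		2
	#define CON_FLAG_SOCK_CLOSED		3
	#define CON_FLAG_BACKOFF		4

	struct example_con {
		unsigned long state;	/* one enum con_state value, under mutex */
		unsigned long flags;	/* test_bit()/set_bit() territory */
	};

This is what makes the BUG_ON(con->state != CON_STATE_NEGOTIATING) assertions possible: a single-valued state can be checked exactly, which a bag of independent bits could not.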
*/ if (!con->in_msg) { + int skip = 0; + dout("got hdr type %d front %d data %d\n", con->in_hdr.type, con->in_hdr.front_len, con->in_hdr.data_len); - skip = 0; - con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); + ret = ceph_con_in_msg_alloc(con, &skip); + if (ret < 0) + return ret; if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); @@ -1737,11 +1883,9 @@ static int read_partial_message(struct ceph_connection *con) con->in_seq++; return 0; } - if (!con->in_msg) { - con->error_msg = - "error allocating memory for incoming message"; - return -ENOMEM; - } + + BUG_ON(!con->in_msg); + BUG_ON(con->in_msg->con != con); m = con->in_msg; m->front.iov_len = 0; /* haven't read it yet */ if (m->middle) @@ -1753,6 +1897,11 @@ static int read_partial_message(struct ceph_connection *con) else con->in_msg_pos.page_pos = 0; con->in_msg_pos.data_pos = 0; + +#ifdef CONFIG_BLOCK + if (m->bio) + init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); +#endif } /* front */ @@ -1769,10 +1918,6 @@ static int read_partial_message(struct ceph_connection *con) if (ret <= 0) return ret; } -#ifdef CONFIG_BLOCK - if (m->bio && !m->bio_iter) - init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); -#endif /* (page) data */ while (con->in_msg_pos.data_pos < data_len) { @@ -1783,7 +1928,7 @@ static int read_partial_message(struct ceph_connection *con) return ret; #ifdef CONFIG_BLOCK } else if (m->bio) { - + BUG_ON(!m->bio_iter); ret = read_partial_message_bio(con, &m->bio_iter, &m->bio_seg, data_len, do_datacrc); @@ -1837,8 +1982,11 @@ static void process_message(struct ceph_connection *con) { struct ceph_msg *msg; + BUG_ON(con->in_msg->con != con); + con->in_msg->con = NULL; msg = con->in_msg; con->in_msg = NULL; + con->ops->put(con); /* if first message, set peer_name */ if (con->peer_name.type == 0) @@ -1858,7 +2006,6 @@ static void process_message(struct ceph_connection *con) con->ops->dispatch(con, msg); mutex_lock(&con->mutex); - prepare_read_tag(con); } @@ -1870,22 +2017,19 @@ static int try_write(struct ceph_connection *con) { int ret = 1; - dout("try_write start %p state %lu nref %d\n", con, con->state, - atomic_read(&con->nref)); + dout("try_write start %p state %lu\n", con, con->state); more: dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); /* open the socket first? */ - if (con->sock == NULL) { - ceph_con_out_kvec_reset(con); + if (con->state == CON_STATE_PREOPEN) { + BUG_ON(con->sock); + con->state = CON_STATE_CONNECTING; + + con_out_kvec_reset(con); prepare_write_banner(con); - ret = prepare_write_connect(con); - if (ret < 0) - goto out; prepare_read_banner(con); - set_bit(CONNECTING, &con->state); - clear_bit(NEGOTIATING, &con->state); BUG_ON(con->in_msg); con->in_tag = CEPH_MSGR_TAG_READY; @@ -1932,7 +2076,7 @@ more_kvec: } do_next: - if (!test_bit(CONNECTING, &con->state)) { + if (con->state == CON_STATE_OPEN) { /* is anything else pending? */ if (!list_empty(&con->out_queue)) { prepare_write_message(con); @@ -1942,14 +2086,15 @@ do_next: prepare_write_ack(con); goto more; } - if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { + if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING, + &con->flags)) { prepare_write_keepalive(con); goto more; } } /* Nothing to do! 
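Clearing CON_FLAG_WRITE_PENDING at the bottom of try_write() is one half of a rearm idiom used throughout this file: producers queue work only on the flag's 0 -> 1 transition, and the worker drops the flag once the output queue is drained so the next sender rearms it. Sketched with the names from this patch (queue_con() is the real scheduling helper; the two wrappers are hypothetical):

	/* Producer side: only the first writer after a drain queues work. */
	static void mark_write_pending(struct ceph_connection *con)
	{
		if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
			queue_con(con);
	}

	/* Worker side: drop the flag after draining so producers rearm it. */
	static void write_drained(struct ceph_connection *con)
	{
		clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
	}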
*/ - clear_bit(WRITE_PENDING, &con->state); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); dout("try_write nothing else to write.\n"); ret = 0; out: @@ -1966,38 +2111,46 @@ static int try_read(struct ceph_connection *con) { int ret = -1; - if (!con->sock) - return 0; - - if (test_bit(STANDBY, &con->state)) +more: + dout("try_read start on %p state %lu\n", con, con->state); + if (con->state != CON_STATE_CONNECTING && + con->state != CON_STATE_NEGOTIATING && + con->state != CON_STATE_OPEN) return 0; - dout("try_read start on %p\n", con); + BUG_ON(!con->sock); -more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); - /* - * process_connect and process_message drop and re-take - * con->mutex. make sure we handle a racing close or reopen. - */ - if (test_bit(CLOSED, &con->state) || - test_bit(OPENING, &con->state)) { - ret = -EAGAIN; + if (con->state == CON_STATE_CONNECTING) { + dout("try_read connecting\n"); + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + + BUG_ON(con->state != CON_STATE_CONNECTING); + con->state = CON_STATE_NEGOTIATING; + + /* + * Received banner is good, exchange connection info. + * Do not reset out_kvec, as sending our banner raced + * with receiving peer banner after connect completed. + */ + ret = prepare_write_connect(con); + if (ret < 0) + goto out; + prepare_read_connect(con); + + /* Send connection info before awaiting response */ goto out; } - if (test_bit(CONNECTING, &con->state)) { - if (!test_bit(NEGOTIATING, &con->state)) { - dout("try_read connecting\n"); - ret = read_partial_banner(con); - if (ret <= 0) - goto out; - ret = process_banner(con); - if (ret < 0) - goto out; - } + if (con->state == CON_STATE_NEGOTIATING) { + dout("try_read negotiating\n"); ret = read_partial_connect(con); if (ret <= 0) goto out; @@ -2007,6 +2160,8 @@ more: goto more; } + BUG_ON(con->state != CON_STATE_OPEN); + if (con->in_base_pos < 0) { /* * skipping + discarding content. 
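After this rewrite try_read() is a proper state machine: anything other than CONNECTING, NEGOTIATING, or OPEN returns immediately, and each readable state consumes exactly its own wire data before advancing. A condensed sketch of the control flow (details and error handling elided):

	static int try_read_sketch(struct ceph_connection *con)
	{
		switch (con->state) {
		case CON_STATE_CONNECTING:
			/* read + verify banner, then send our connect info */
			con->state = CON_STATE_NEGOTIATING;
			return 0;
		case CON_STATE_NEGOTIATING:
			/* read connect reply; process_connect() may -> OPEN */
			return 0;
		case CON_STATE_OPEN:
			/* read tags, acks, and message payloads */
			return 0;
		default:
			return 0;	/* closed/standby/preopen: no reads */
		}
	}

Note the ordering fix in the comment within the hunk above: connection info is now sent only after the peer's banner has been validated, rather than being queued speculatively alongside our own banner.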
@@ -2040,7 +2195,8 @@ more: prepare_read_ack(con); break; case CEPH_MSGR_TAG_CLOSE: - set_bit(CLOSED, &con->state); /* fixme */ + con_close_socket(con); + con->state = CON_STATE_CLOSED; goto out; default: goto bad_tag; @@ -2063,6 +2219,8 @@ more: if (con->in_tag == CEPH_MSGR_TAG_READY) goto more; process_message(con); + if (con->state == CON_STATE_OPEN) + prepare_read_tag(con); goto more; } if (con->in_tag == CEPH_MSGR_TAG_ACK) { @@ -2091,12 +2249,6 @@ bad_tag: */ static void queue_con(struct ceph_connection *con) { - if (test_bit(DEAD, &con->state)) { - dout("queue_con %p ignoring: DEAD\n", - con); - return; - } - if (!con->ops->get(con)) { dout("queue_con %p ref count 0\n", con); return; @@ -2121,7 +2273,26 @@ static void con_work(struct work_struct *work) mutex_lock(&con->mutex); restart: - if (test_and_clear_bit(BACKOFF, &con->state)) { + if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) { + switch (con->state) { + case CON_STATE_CONNECTING: + con->error_msg = "connection failed"; + break; + case CON_STATE_NEGOTIATING: + con->error_msg = "negotiation failed"; + break; + case CON_STATE_OPEN: + con->error_msg = "socket closed"; + break; + default: + dout("unrecognized con state %d\n", (int)con->state); + con->error_msg = "unrecognized con state"; + BUG(); + } + goto fault; + } + + if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { dout("con_work %p backing off\n", con); if (queue_delayed_work(ceph_msgr_wq, &con->work, round_jiffies_relative(con->delay))) { @@ -2135,35 +2306,35 @@ restart: } } - if (test_bit(STANDBY, &con->state)) { + if (con->state == CON_STATE_STANDBY) { dout("con_work %p STANDBY\n", con); goto done; } - if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ - dout("con_work CLOSED\n"); - con_close_socket(con); + if (con->state == CON_STATE_CLOSED) { + dout("con_work %p CLOSED\n", con); + BUG_ON(con->sock); goto done; } - if (test_and_clear_bit(OPENING, &con->state)) { - /* reopen w/ new peer */ + if (con->state == CON_STATE_PREOPEN) { dout("con_work OPENING\n"); - con_close_socket(con); + BUG_ON(con->sock); } - if (test_and_clear_bit(SOCK_CLOSED, &con->state)) - goto fault; - ret = try_read(con); if (ret == -EAGAIN) goto restart; - if (ret < 0) + if (ret < 0) { + con->error_msg = "socket error on read"; goto fault; + } ret = try_write(con); if (ret == -EAGAIN) goto restart; - if (ret < 0) + if (ret < 0) { + con->error_msg = "socket error on write"; goto fault; + } done: mutex_unlock(&con->mutex); @@ -2172,7 +2343,6 @@ done_unlocked: return; fault: - mutex_unlock(&con->mutex); ceph_fault(con); /* error/fault path */ goto done_unlocked; } @@ -2183,26 +2353,31 @@ fault: * exponential backoff */ static void ceph_fault(struct ceph_connection *con) + __releases(con->mutex) { pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); - if (test_bit(LOSSYTX, &con->state)) { - dout("fault on LOSSYTX channel\n"); - goto out; - } - - mutex_lock(&con->mutex); - if (test_bit(CLOSED, &con->state)) - goto out_unlock; + BUG_ON(con->state != CON_STATE_CONNECTING && + con->state != CON_STATE_NEGOTIATING && + con->state != CON_STATE_OPEN); con_close_socket(con); + if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) { + dout("fault on LOSSYTX channel, marking CLOSED\n"); + con->state = CON_STATE_CLOSED; + goto out_unlock; + } + if (con->in_msg) { + BUG_ON(con->in_msg->con != con); + con->in_msg->con = NULL; 
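A theme running through the receive and fault paths here: a message that is queued on, or being read into, a connection now pins that connection via msg->con, taken with con->ops->get() and released with con->ops->put(). Every site follows the same pairing, sketched below with hypothetical helper names (the patch open-codes this at each site):

	/* Attach: the in-flight message keeps its connection alive. */
	static void msg_attach_con(struct ceph_msg *msg, struct ceph_connection *con)
	{
		BUG_ON(msg->con != NULL);
		msg->con = con->ops->get(con);
		BUG_ON(msg->con == NULL);
	}

	/* Detach: clear the back-pointer before dropping the reference. */
	static void msg_detach_con(struct ceph_msg *msg)
	{
		struct ceph_connection *con = msg->con;

		msg->con = NULL;
		con->ops->put(con);
	}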
ceph_msg_put(con->in_msg); con->in_msg = NULL; + con->ops->put(con); } /* Requeue anything that hasn't been acked */ @@ -2211,12 +2386,13 @@ static void ceph_fault(struct ceph_connection *con) /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !test_bit(KEEPALIVE_PENDING, &con->state)) { + !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - clear_bit(WRITE_PENDING, &con->state); - set_bit(STANDBY, &con->state); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); + con->state = CON_STATE_STANDBY; } else { /* retry after a delay. */ + con->state = CON_STATE_PREOPEN; if (con->delay == 0) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) @@ -2237,13 +2413,12 @@ static void ceph_fault(struct ceph_connection *con) * that when con_work restarts we schedule the * delay then. */ - set_bit(BACKOFF, &con->state); + set_bit(CON_FLAG_BACKOFF, &con->flags); } } out_unlock: mutex_unlock(&con->mutex); -out: /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. @@ -2260,18 +2435,14 @@ out: /* - * create a new messenger instance + * initialize a new messenger instance */ -struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features) +void ceph_messenger_init(struct ceph_messenger *msgr, + struct ceph_entity_addr *myaddr, + u32 supported_features, + u32 required_features, + bool nocrc) { - struct ceph_messenger *msgr; - - msgr = kzalloc(sizeof(*msgr), GFP_KERNEL); - if (msgr == NULL) - return ERR_PTR(-ENOMEM); - msgr->supported_features = supported_features; msgr->required_features = required_features; @@ -2284,30 +2455,23 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, msgr->inst.addr.type = 0; get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); encode_my_addr(msgr); + msgr->nocrc = nocrc; - dout("messenger_create %p\n", msgr); - return msgr; -} -EXPORT_SYMBOL(ceph_messenger_create); + atomic_set(&msgr->stopping, 0); -void ceph_messenger_destroy(struct ceph_messenger *msgr) -{ - dout("destroy %p\n", msgr); - kfree(msgr); - dout("destroyed messenger %p\n", msgr); + dout("%s %p\n", __func__, msgr); } -EXPORT_SYMBOL(ceph_messenger_destroy); +EXPORT_SYMBOL(ceph_messenger_init); static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? 
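ceph_messenger_create()/ceph_messenger_destroy() give way to ceph_messenger_init() because the messenger is now embedded in its client rather than heap-allocated, which removes both an allocation failure path and a matching kfree(). A sketch of the shape change (struct contents abbreviated, container names hypothetical):

	/* Before: a pointer, a kzalloc() that can fail, and teardown to match. */
	struct client_before {
		struct ceph_messenger *msgr;
	};

	/* After: embedded and initialized in place; freed with its container. */
	struct client_after {
		struct ceph_messenger msgr;
	};

	static void client_setup(struct client_after *cl, struct ceph_entity_addr *addr)
	{
		ceph_messenger_init(&cl->msgr, addr, 0, 0, false);
	}

The same embedding happens to the monitor client's connection below (monc->con becomes a struct rather than a pointer), which is why its get/put connection ops can degenerate into no-ops.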
*/ - if (test_and_clear_bit(STANDBY, &con->state)) { - mutex_lock(&con->mutex); + if (con->state == CON_STATE_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); + con->state = CON_STATE_PREOPEN; con->connect_seq++; - WARN_ON(test_bit(WRITE_PENDING, &con->state)); - WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state)); - mutex_unlock(&con->mutex); + WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags)); + WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)); } } @@ -2316,21 +2480,24 @@ static void clear_standby(struct ceph_connection *con) */ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { - if (test_bit(CLOSED, &con->state)) { - dout("con_send %p closed, dropping %p\n", con, msg); - ceph_msg_put(msg); - return; - } - /* set src+dst */ msg->hdr.src = con->msgr->inst.name; - BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); - msg->needs_out_seq = true; - /* queue */ mutex_lock(&con->mutex); + + if (con->state == CON_STATE_CLOSED) { + dout("con_send %p closed, dropping %p\n", con, msg); + ceph_msg_put(msg); + mutex_unlock(&con->mutex); + return; + } + + BUG_ON(msg->con != NULL); + msg->con = con->ops->get(con); + BUG_ON(msg->con == NULL); + BUG_ON(!list_empty(&msg->list_head)); list_add_tail(&msg->list_head, &con->out_queue); dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, @@ -2339,12 +2506,13 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len)); + + clear_standby(con); mutex_unlock(&con->mutex); /* if there wasn't anything waiting to send before, queue * new work */ - clear_standby(con); - if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) + if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -2352,24 +2520,34 @@ EXPORT_SYMBOL(ceph_con_send); /* * Revoke a message that was previously queued for send */ -void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) +void ceph_msg_revoke(struct ceph_msg *msg) { + struct ceph_connection *con = msg->con; + + if (!con) + return; /* Message not in our possession */ + mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { - dout("con_revoke %p msg %p - was on queue\n", con, msg); + dout("%s %p msg %p - was on queue\n", __func__, con, msg); list_del_init(&msg->list_head); - ceph_msg_put(msg); + BUG_ON(msg->con == NULL); + msg->con->ops->put(msg->con); + msg->con = NULL; msg->hdr.seq = 0; + + ceph_msg_put(msg); } if (con->out_msg == msg) { - dout("con_revoke %p msg %p - was sending\n", con, msg); + dout("%s %p msg %p - was sending\n", __func__, con, msg); con->out_msg = NULL; if (con->out_kvec_is_msg) { con->out_skip = con->out_kvec_bytes; con->out_kvec_is_msg = false; } - ceph_msg_put(msg); msg->hdr.seq = 0; + + ceph_msg_put(msg); } mutex_unlock(&con->mutex); } @@ -2377,17 +2555,27 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) /* * Revoke a message that we may be reading data into */ -void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) +void ceph_msg_revoke_incoming(struct ceph_msg *msg) { + struct ceph_connection *con; + + BUG_ON(msg == NULL); + if (!msg->con) { + dout("%s msg %p null con\n", __func__, msg); + + return; /* Message not in our possession */ + } + + con = msg->con; mutex_lock(&con->mutex); - if (con->in_msg && con->in_msg == msg) { + if (con->in_msg == msg) { unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); 
unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); /* skip rest of message */ - dout("con_revoke_pages %p msg %p revoked\n", con, msg); - con->in_base_pos = con->in_base_pos - + dout("%s %p msg %p revoked\n", __func__, con, msg); + con->in_base_pos = con->in_base_pos - sizeof(struct ceph_msg_header) - front_len - middle_len - @@ -2398,8 +2586,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; } else { - dout("con_revoke_pages %p msg %p pages %p no-op\n", - con, con->in_msg, msg); + dout("%s %p in_msg %p msg %p no-op\n", + __func__, con, con->in_msg, msg); } mutex_unlock(&con->mutex); } @@ -2410,9 +2598,11 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) void ceph_con_keepalive(struct ceph_connection *con) { dout("con_keepalive %p\n", con); + mutex_lock(&con->mutex); clear_standby(con); - if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && - test_and_set_bit(WRITE_PENDING, &con->state) == 0) + mutex_unlock(&con->mutex); + if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 && + test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); @@ -2431,6 +2621,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, if (m == NULL) goto out; kref_init(&m->kref); + + m->con = NULL; INIT_LIST_HEAD(&m->list_head); m->hdr.tid = 0; @@ -2526,46 +2718,77 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) } /* - * Generic message allocator, for incoming messages. + * Allocate a message for receiving an incoming message on a + * connection, and save the result in con->in_msg. Uses the + * connection's private alloc_msg op if available. + * + * Returns 0 on success, or a negative error code. + * + * On success, if we set *skip = 1: + * - the next message should be skipped and ignored. + * - con->in_msg == NULL + * or if we set *skip = 0: + * - con->in_msg is non-null. 
+ * On error (ENOMEM, EAGAIN, ...), + * - con->in_msg == NULL */ -static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) { + struct ceph_msg_header *hdr = &con->in_hdr; int type = le16_to_cpu(hdr->type); int front_len = le32_to_cpu(hdr->front_len); int middle_len = le32_to_cpu(hdr->middle_len); - struct ceph_msg *msg = NULL; - int ret; + int ret = 0; + + BUG_ON(con->in_msg != NULL); if (con->ops->alloc_msg) { + struct ceph_msg *msg; + mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (!msg || *skip) - return NULL; + if (con->state != CON_STATE_OPEN) { + ceph_msg_put(msg); + return -EAGAIN; + } + con->in_msg = msg; + if (con->in_msg) { + con->in_msg->con = con->ops->get(con); + BUG_ON(con->in_msg->con == NULL); + } + if (*skip) { + con->in_msg = NULL; + return 0; + } + if (!con->in_msg) { + con->error_msg = + "error allocating memory for incoming message"; + return -ENOMEM; + } } - if (!msg) { - *skip = 0; - msg = ceph_msg_new(type, front_len, GFP_NOFS, false); - if (!msg) { + if (!con->in_msg) { + con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!con->in_msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); - return NULL; + return -ENOMEM; } - msg->page_alignment = le16_to_cpu(hdr->data_off); + con->in_msg->con = con->ops->get(con); + BUG_ON(con->in_msg->con == NULL); + con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); } - memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); + memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); - if (middle_len && !msg->middle) { - ret = ceph_alloc_middle(con, msg); + if (middle_len && !con->in_msg->middle) { + ret = ceph_alloc_middle(con, con->in_msg); if (ret < 0) { - ceph_msg_put(msg); - return NULL; + ceph_msg_put(con->in_msg); + con->in_msg = NULL; } } - return msg; + return ret; } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index d0649a9655be..900ea0f043fc 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -106,9 +106,9 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) monc->pending_auth = 1; monc->m_auth->front.iov_len = len; monc->m_auth->hdr.front_len = cpu_to_le32(len); - ceph_con_revoke(monc->con, monc->m_auth); + ceph_msg_revoke(monc->m_auth); ceph_msg_get(monc->m_auth); /* keep our ref */ - ceph_con_send(monc->con, monc->m_auth); + ceph_con_send(&monc->con, monc->m_auth); } /* @@ -117,8 +117,11 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) static void __close_session(struct ceph_mon_client *monc) { dout("__close_session closing mon%d\n", monc->cur_mon); - ceph_con_revoke(monc->con, monc->m_auth); - ceph_con_close(monc->con); + ceph_msg_revoke(monc->m_auth); + ceph_msg_revoke_incoming(monc->m_auth_reply); + ceph_msg_revoke(monc->m_subscribe); + ceph_msg_revoke_incoming(monc->m_subscribe_ack); + ceph_con_close(&monc->con); monc->cur_mon = -1; monc->pending_auth = 0; ceph_auth_reset(monc->auth); @@ -142,9 +145,8 @@ static int __open_session(struct ceph_mon_client *monc) monc->want_next_osdmap = !!monc->want_next_osdmap; dout("open_session mon%d opening\n", monc->cur_mon); - monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; - monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); - ceph_con_open(monc->con, + ceph_con_open(&monc->con, + CEPH_ENTITY_TYPE_MON, monc->cur_mon, 
&monc->monmap->mon_inst[monc->cur_mon].addr); /* initiatiate authentication handshake */ @@ -226,8 +228,8 @@ static void __send_subscribe(struct ceph_mon_client *monc) msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_con_revoke(monc->con, msg); - ceph_con_send(monc->con, ceph_msg_get(msg)); + ceph_msg_revoke(msg); + ceph_con_send(&monc->con, ceph_msg_get(msg)); monc->sub_sent = jiffies | 1; /* never 0 */ } @@ -247,7 +249,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, if (monc->hunting) { pr_info("mon%d %s session established\n", monc->cur_mon, - ceph_pr_addr(&monc->con->peer_addr.in_addr)); + ceph_pr_addr(&monc->con.peer_addr.in_addr)); monc->hunting = false; } dout("handle_subscribe_ack after %d seconds\n", seconds); @@ -309,6 +311,17 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) EXPORT_SYMBOL(ceph_monc_open_session); /* + * We require the fsid and global_id in order to initialize our + * debugfs dir. + */ +static bool have_debugfs_info(struct ceph_mon_client *monc) +{ + dout("have_debugfs_info fsid %d globalid %lld\n", + (int)monc->client->have_fsid, monc->auth->global_id); + return monc->client->have_fsid && monc->auth->global_id > 0; +} + +/* * The monitor responds with mount ack indicate mount success. The * included client ticket allows the client to talk to MDSs and OSDs. */ @@ -318,9 +331,12 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_client *client = monc->client; struct ceph_monmap *monmap = NULL, *old = monc->monmap; void *p, *end; + int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); + had_debugfs_info = have_debugfs_info(monc); + dout("handle_monmap\n"); p = msg->front.iov_base; end = p + msg->front.iov_len; @@ -342,12 +358,22 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, if (!client->have_fsid) { client->have_fsid = true; + if (!had_debugfs_info && have_debugfs_info(monc)) { + pr_info("client%lld fsid %pU\n", + ceph_client_id(monc->client), + &monc->client->fsid); + init_debugfs = 1; + } mutex_unlock(&monc->mutex); - /* - * do debugfs initialization without mutex to avoid - * creating a locking dependency - */ - ceph_debugfs_client_init(client); + + if (init_debugfs) { + /* + * do debugfs initialization without mutex to avoid + * creating a locking dependency + */ + ceph_debugfs_client_init(monc->client); + } + goto out_unlocked; } out: @@ -439,6 +465,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, m = NULL; } else { dout("get_generic_reply %lld got %p\n", tid, req->reply); + *skip = 0; m = ceph_msg_get(req->reply); /* * we don't need to track the connection reading into @@ -461,7 +488,7 @@ static int do_generic_request(struct ceph_mon_client *monc, req->request->hdr.tid = cpu_to_le64(req->tid); __insert_generic_request(monc, req); monc->num_generic_requests++; - ceph_con_send(monc->con, ceph_msg_get(req->request)); + ceph_con_send(&monc->con, ceph_msg_get(req->request)); mutex_unlock(&monc->mutex); err = wait_for_completion_interruptible(&req->completion); @@ -684,8 +711,9 @@ static void __resend_generic_request(struct ceph_mon_client *monc) for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { req = rb_entry(p, struct ceph_mon_generic_request, node); - ceph_con_revoke(monc->con, req->request); - ceph_con_send(monc->con, ceph_msg_get(req->request)); + ceph_msg_revoke(req->request); + ceph_msg_revoke_incoming(req->reply); + ceph_con_send(&monc->con, 
ceph_msg_get(req->request)); } } @@ -705,7 +733,7 @@ static void delayed_work(struct work_struct *work) __close_session(monc); __open_session(monc); /* continue hunting */ } else { - ceph_con_keepalive(monc->con); + ceph_con_keepalive(&monc->con); __validate_auth(monc); @@ -760,19 +788,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) goto out; /* connection */ - monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); - if (!monc->con) - goto out_monmap; - ceph_con_init(monc->client->msgr, monc->con); - monc->con->private = monc; - monc->con->ops = &mon_con_ops; - /* authentication */ monc->auth = ceph_auth_init(cl->options->name, cl->options->key); if (IS_ERR(monc->auth)) { err = PTR_ERR(monc->auth); - goto out_con; + goto out_monmap; } monc->auth->want_keys = CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | @@ -801,6 +822,9 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) if (!monc->m_auth) goto out_auth_reply; + ceph_con_init(&monc->con, monc, &mon_con_ops, + &monc->client->msgr); + monc->cur_mon = -1; monc->hunting = true; monc->sub_renew_after = jiffies; @@ -824,8 +848,6 @@ out_subscribe_ack: ceph_msg_put(monc->m_subscribe_ack); out_auth: ceph_auth_destroy(monc->auth); -out_con: - monc->con->ops->put(monc->con); out_monmap: kfree(monc->monmap); out: @@ -841,10 +863,6 @@ void ceph_monc_stop(struct ceph_mon_client *monc) mutex_lock(&monc->mutex); __close_session(monc); - monc->con->private = NULL; - monc->con->ops->put(monc->con); - monc->con = NULL; - mutex_unlock(&monc->mutex); /* @@ -871,8 +889,10 @@ static void handle_auth_reply(struct ceph_mon_client *monc, { int ret; int was_auth = 0; + int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); + had_debugfs_info = have_debugfs_info(monc); if (monc->auth->ops) was_auth = monc->auth->ops->is_authenticated(monc->auth); monc->pending_auth = 0; @@ -888,14 +908,29 @@ static void handle_auth_reply(struct ceph_mon_client *monc, } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); - monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; - monc->client->msgr->inst.name.num = + monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; + monc->client->msgr.inst.name.num = cpu_to_le64(monc->auth->global_id); __send_subscribe(monc); __resend_generic_request(monc); } + + if (!had_debugfs_info && have_debugfs_info(monc)) { + pr_info("client%lld fsid %pU\n", + ceph_client_id(monc->client), + &monc->client->fsid); + init_debugfs = 1; + } mutex_unlock(&monc->mutex); + + if (init_debugfs) { + /* + * do debugfs initialization without mutex to avoid + * creating a locking dependency + */ + ceph_debugfs_client_init(monc->client); + } } static int __validate_auth(struct ceph_mon_client *monc) @@ -1000,6 +1035,8 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: m = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!m) + return NULL; /* ENOMEM--return skip == 0 */ break; } @@ -1029,7 +1066,7 @@ static void mon_fault(struct ceph_connection *con) if (!monc->hunting) pr_info("mon%d %s session lost, " "hunting for new mon\n", monc->cur_mon, - ceph_pr_addr(&monc->con->peer_addr.in_addr)); + ceph_pr_addr(&monc->con.peer_addr.in_addr)); __close_session(monc); if (!monc->hunting) { @@ -1044,9 +1081,23 @@ out: mutex_unlock(&monc->mutex); } +/* + * We can ignore refcounting on the connection struct, as all references + * will come from the messenger workqueue, which 
is drained prior to + * mon_client destruction. + */ +static struct ceph_connection *con_get(struct ceph_connection *con) +{ + return con; +} + +static void con_put(struct ceph_connection *con) +{ +} + static const struct ceph_connection_operations mon_con_ops = { - .get = ceph_con_get, - .put = ceph_con_put, + .get = con_get, + .put = con_put, .dispatch = dispatch, .fault = mon_fault, .alloc_msg = mon_alloc_msg, diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index 11d5f4196a73..ddec1c10ac80 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg) struct ceph_msgpool *pool = arg; struct ceph_msg *msg; - msg = ceph_msg_new(0, pool->front_len, gfp_mask, true); + msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true); if (!msg) { dout("msgpool_alloc %s failed\n", pool->name); } else { @@ -32,10 +32,11 @@ static void msgpool_free(void *element, void *arg) ceph_msg_put(msg); } -int ceph_msgpool_init(struct ceph_msgpool *pool, +int ceph_msgpool_init(struct ceph_msgpool *pool, int type, int front_len, int size, bool blocking, const char *name) { dout("msgpool %s init\n", name); + pool->type = type; pool->front_len = front_len; pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); if (!pool->pool) @@ -61,7 +62,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, WARN_ON(1); /* try to alloc a fresh message */ - return ceph_msg_new(0, front_len, GFP_NOFS, false); + return ceph_msg_new(pool->type, front_len, GFP_NOFS, false); } msg = mempool_alloc(pool->pool, GFP_NOFS); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ca59e66c9787..42119c05e82c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -140,10 +140,9 @@ void ceph_osdc_release_request(struct kref *kref) if (req->r_request) ceph_msg_put(req->r_request); if (req->r_con_filling_msg) { - dout("release_request revoking pages %p from con %p\n", + dout("%s revoking pages %p from con %p\n", __func__, req->r_pages, req->r_con_filling_msg); - ceph_con_revoke_message(req->r_con_filling_msg, - req->r_reply); + ceph_msg_revoke_incoming(req->r_reply); req->r_con_filling_msg->ops->put(req->r_con_filling_msg); } if (req->r_reply) @@ -214,10 +213,13 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, kref_init(&req->r_kref); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); + rb_init_node(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_osd); INIT_LIST_HEAD(&req->r_req_lru_item); + INIT_LIST_HEAD(&req->r_osd_item); + req->r_flags = flags; WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); @@ -243,6 +245,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } ceph_pagelist_init(req->r_trail); } + /* create request message; allow space for oid */ msg_size += MAX_OBJ_NAME_SIZE; if (snapc) @@ -256,7 +259,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, return NULL; } - msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); memset(msg->front.iov_base, 0, msg->front.iov_len); req->r_request = msg; @@ -624,7 +626,7 @@ static void osd_reset(struct ceph_connection *con) /* * Track open sessions with osds. 
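The msgpool change above means a pool is created knowing the message type it serves, so messages drawn from it (and those built on its fallback path) already carry the right hdr.type instead of type 0. Usage mirroring the osd_client conversion further down, wrapped in a sketch function:

	static int osd_op_pool_demo(struct ceph_msgpool *pool)
	{
		struct ceph_msg *msg;
		int err;

		err = ceph_msgpool_init(pool, CEPH_MSG_OSD_OP,
					OSD_OP_FRONT_LEN, 10, true, "osd_op");
		if (err < 0)
			return err;

		/* hdr.type is already CEPH_MSG_OSD_OP; callers no longer
		 * need to patch it in after allocation. */
		msg = ceph_msgpool_get(pool, 0);
		if (!msg)
			return -ENOMEM;

		ceph_msg_put(msg);
		return 0;
	}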
*/ -static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) +static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) { struct ceph_osd *osd; @@ -634,15 +636,13 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) atomic_set(&osd->o_ref, 1); osd->o_osdc = osdc; + osd->o_osd = onum; INIT_LIST_HEAD(&osd->o_requests); INIT_LIST_HEAD(&osd->o_linger_requests); INIT_LIST_HEAD(&osd->o_osd_lru); osd->o_incarnation = 1; - ceph_con_init(osdc->client->msgr, &osd->o_con); - osd->o_con.private = osd; - osd->o_con.ops = &osd_con_ops; - osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD; + ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); INIT_LIST_HEAD(&osd->o_keepalive_item); return osd; @@ -688,7 +688,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) static void remove_all_osds(struct ceph_osd_client *osdc) { - dout("__remove_old_osds %p\n", osdc); + dout("%s %p\n", __func__, osdc); mutex_lock(&osdc->request_mutex); while (!RB_EMPTY_ROOT(&osdc->osds)) { struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), @@ -752,7 +752,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) ret = -EAGAIN; } else { ceph_con_close(&osd->o_con); - ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); + ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, + &osdc->osdmap->osd_addr[osd->o_osd]); osd->o_incarnation++; } return ret; @@ -853,7 +854,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, if (req->r_osd) { /* make sure the original request isn't in flight. */ - ceph_con_revoke(&req->r_osd->o_con, req->r_request); + ceph_msg_revoke(req->r_request); list_del_init(&req->r_osd_item); if (list_empty(&req->r_osd->o_requests) && @@ -880,7 +881,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, static void __cancel_request(struct ceph_osd_request *req) { if (req->r_sent && req->r_osd) { - ceph_con_revoke(&req->r_osd->o_con, req->r_request); + ceph_msg_revoke(req->r_request); req->r_sent = 0; } } @@ -890,7 +891,9 @@ static void __register_linger_request(struct ceph_osd_client *osdc, { dout("__register_linger_request %p\n", req); list_add_tail(&req->r_linger_item, &osdc->req_linger); - list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests); + if (req->r_osd) + list_add_tail(&req->r_linger_osd, + &req->r_osd->o_linger_requests); } static void __unregister_linger_request(struct ceph_osd_client *osdc, @@ -998,18 +1001,18 @@ static int __map_request(struct ceph_osd_client *osdc, req->r_osd = __lookup_osd(osdc, o); if (!req->r_osd && o >= 0) { err = -ENOMEM; - req->r_osd = create_osd(osdc); + req->r_osd = create_osd(osdc, o); if (!req->r_osd) { list_move(&req->r_req_lru_item, &osdc->req_notarget); goto out; } dout("map_request osd %p is osd%d\n", req->r_osd, o); - req->r_osd->o_osd = o; - req->r_osd->o_con.peer_name.num = cpu_to_le64(o); __insert_osd(osdc, req->r_osd); - ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]); + ceph_con_open(&req->r_osd->o_con, + CEPH_ENTITY_TYPE_OSD, o, + &osdc->osdmap->osd_addr[o]); } if (req->r_osd) { @@ -1304,8 +1307,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("kick_requests %s\n", force_resend ? 
" (force resend)" : ""); mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + for (p = rb_first(&osdc->requests); p; ) { req = rb_entry(p, struct ceph_osd_request, r_node); + p = rb_next(p); err = __map_request(osdc, req, force_resend); if (err < 0) continue; /* error */ @@ -1313,10 +1317,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("%p tid %llu maps to no osd\n", req, req->r_tid); needmap++; /* request a newer map */ } else if (err > 0) { - dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - if (!req->r_linger) + if (!req->r_linger) { + dout("%p tid %llu requeued on osd%d\n", req, + req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); req->r_flags |= CEPH_OSD_FLAG_RETRY; + } + } + if (req->r_linger && list_empty(&req->r_linger_item)) { + /* + * register as a linger so that we will + * re-submit below and get a new tid + */ + dout("%p tid %llu restart on osd%d\n", + req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + __register_linger_request(osdc, req); + __unregister_request(osdc, req); } } @@ -1391,7 +1408,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) epoch, maplen); newmap = osdmap_apply_incremental(&p, next, osdc->osdmap, - osdc->client->msgr); + &osdc->client->msgr); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; @@ -1839,11 +1856,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->req_mempool) goto out; - err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, + err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, + OSD_OP_FRONT_LEN, 10, true, "osd_op"); if (err < 0) goto out_mempool; - err = ceph_msgpool_init(&osdc->msgpool_op_reply, + err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, OSD_OPREPLY_FRONT_LEN, 10, true, "osd_op_reply"); if (err < 0) @@ -2019,15 +2037,15 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, if (!req) { *skip = 1; m = NULL; - pr_info("get_reply unknown tid %llu from osd%d\n", tid, - osd->o_osd); + dout("get_reply unknown tid %llu from osd%d\n", tid, + osd->o_osd); goto out; } if (req->r_con_filling_msg) { - dout("get_reply revoking msg %p from old con %p\n", + dout("%s revoking msg %p from old con %p\n", __func__, req->r_reply, req->r_con_filling_msg); - ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); + ceph_msg_revoke_incoming(req->r_reply); req->r_con_filling_msg->ops->put(req->r_con_filling_msg); req->r_con_filling_msg = NULL; } @@ -2080,6 +2098,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, int type = le16_to_cpu(hdr->type); int front = le32_to_cpu(hdr->front_len); + *skip = 0; switch (type) { case CEPH_MSG_OSD_MAP: case CEPH_MSG_WATCH_NOTIFY: diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 81e3b84a77ef..3124b71a8883 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -135,6 +135,21 @@ bad: return -EINVAL; } +static int skip_name_map(void **p, void *end) +{ + int len; + ceph_decode_32_safe(p, end, len ,bad); + while (len--) { + int strlen; + *p += sizeof(u32); + ceph_decode_32_safe(p, end, strlen, bad); + *p += strlen; +} + return 0; +bad: + return -EINVAL; +} + static struct crush_map *crush_decode(void *pbyval, void *end) { struct crush_map *c; @@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) void **p = &pbyval; void *start = pbyval; u32 magic; + u32 num_name_maps; dout("crush_decode %p to %p 
len %d\n", *p, end, (int)(end - *p)); @@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) if (c == NULL) return ERR_PTR(-ENOMEM); + /* set tunables to default values */ + c->choose_local_tries = 2; + c->choose_local_fallback_tries = 5; + c->choose_total_tries = 19; + ceph_decode_need(p, end, 4*sizeof(u32), bad); magic = ceph_decode_32(p); if (magic != CRUSH_MAGIC) { @@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end) } /* ignore trailing name maps. */ + for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { + err = skip_name_map(p, end); + if (err < 0) + goto done; + } + + /* tunables */ + ceph_decode_need(p, end, 3*sizeof(u32), done); + c->choose_local_tries = ceph_decode_32(p); + c->choose_local_fallback_tries = ceph_decode_32(p); + c->choose_total_tries = ceph_decode_32(p); + dout("crush decode tunable choose_local_tries = %d", + c->choose_local_tries); + dout("crush decode tunable choose_local_fallback_tries = %d", + c->choose_local_fallback_tries); + dout("crush decode tunable choose_total_tries = %d", + c->choose_total_tries); +done: dout("crush_decode success\n"); return c; @@ -488,15 +527,16 @@ static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) ceph_decode_32_safe(p, end, pool, bad); ceph_decode_32_safe(p, end, len, bad); dout(" pool %d len %d\n", pool, len); + ceph_decode_need(p, end, len, bad); pi = __lookup_pg_pool(&map->pg_pools, pool); if (pi) { + char *name = kstrndup(*p, len, GFP_NOFS); + + if (!name) + return -ENOMEM; kfree(pi->name); - pi->name = kmalloc(len + 1, GFP_NOFS); - if (pi->name) { - memcpy(pi->name, *p, len); - pi->name[len] = '\0'; - dout(" name is %s\n", pi->name); - } + pi->name = name; + dout(" name is %s\n", pi->name); } *p += len; } @@ -666,6 +706,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); ceph_decode_copy(p, &pgid, sizeof(pgid)); n = ceph_decode_32(p); + err = -EINVAL; + if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) + goto bad; ceph_decode_need(p, end, n * sizeof(u32), bad); err = -ENOMEM; pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); @@ -889,6 +932,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, (void) __remove_pg_mapping(&map->pg_temp, pgid); /* insert */ + if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) { + err = -EINVAL; + goto bad; + } pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); if (!pg) { err = -ENOMEM; diff --git a/net/core/dev.c b/net/core/dev.c index 0ebaea16632f..89e33a5d4d93 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1055,6 +1055,8 @@ rollback: */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { + char *new_ifalias; + ASSERT_RTNL(); if (len >= IFALIASZ) @@ -1068,9 +1070,10 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return 0; } - dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); - if (!dev->ifalias) + new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); + if (!new_ifalias) return -ENOMEM; + dev->ifalias = new_ifalias; strlcpy(dev->ifalias, alias, len+1); return len; @@ -1172,6 +1175,7 @@ static int __dev_open(struct net_device *dev) net_dmaengine_get(); dev_set_rx_mode(dev); dev_activate(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); } return ret; @@ -1638,6 +1642,19 @@ static inline int deliver_skb(struct sk_buff *skb, return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline bool skb_loop_sk(struct packet_type 
*ptype, struct sk_buff *skb) +{ + if (ptype->af_packet_priv == NULL) + return false; + + if (ptype->id_match) + return ptype->id_match(ptype, skb->sk); + else if ((struct sock *)ptype->af_packet_priv == skb->sk) + return true; + + return false; +} + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -1655,8 +1672,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) * they originated from - MvS (miquels@drinkel.ow.org) */ if ((ptype->dev == dev || !ptype->dev) && - (ptype->af_packet_priv == NULL || - (struct sock *)ptype->af_packet_priv != skb->sk)) { + (!skb_loop_sk(ptype, skb))) { if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; @@ -2118,7 +2134,8 @@ static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) static netdev_features_t harmonize_features(struct sk_buff *skb, __be16 protocol, netdev_features_t features) { - if (!can_checksum_protocol(features, protocol)) { + if (skb->ip_summed != CHECKSUM_NONE && + !can_checksum_protocol(features, protocol)) { features &= ~NETIF_F_ALL_CSUM; features &= ~NETIF_F_SG; } else if (illegal_highdma(skb->dev, skb)) { @@ -2133,6 +2150,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) __be16 protocol = skb->protocol; netdev_features_t features = skb->dev->features; + if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + features &= ~NETIF_F_GSO_MASK; + if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; @@ -2628,15 +2648,16 @@ void __skb_get_rxhash(struct sk_buff *skb) if (!skb_flow_dissect(skb, &keys)) return; - if (keys.ports) { - if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]) - swap(keys.port16[0], keys.port16[1]); + if (keys.ports) skb->l4_rxhash = 1; - } /* get a consistent hash (same value on both flow directions) */ - if ((__force u32)keys.dst < (__force u32)keys.src) + if (((__force u32)keys.dst < (__force u32)keys.src) || + (((__force u32)keys.dst == (__force u32)keys.src) && + ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { swap(keys.dst, keys.src); + swap(keys.port16[0], keys.port16[1]); + } hash = jhash_3words((__force u32)keys.dst, (__force u32)keys.src, @@ -3155,6 +3176,23 @@ void netdev_rx_handler_unregister(struct net_device *dev) } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); +/* + * Limit the use of PFMEMALLOC reserves to those protocols that implement + * the special handling of PFMEMALLOC skbs. + */ +static bool skb_pfmemalloc_protocol(struct sk_buff *skb) +{ + switch (skb->protocol) { + case __constant_htons(ETH_P_ARP): + case __constant_htons(ETH_P_IP): + case __constant_htons(ETH_P_IPV6): + case __constant_htons(ETH_P_8021Q): + return true; + default: + return false; + } +} + static int __netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; @@ -3164,14 +3202,27 @@ static int __netif_receive_skb(struct sk_buff *skb) bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; + unsigned long pflags = current->flags; net_timestamp_check(!netdev_tstamp_prequeue, skb); trace_netif_receive_skb(skb); + /* + * PFMEMALLOC skbs are special, they should + * - be delivered to SOCK_MEMALLOC sockets only + * - stay away from userspace + * - have bounded memory usage + * + * Use PF_MEMALLOC as this saves us from propagating the allocation + * context down to all allocation sites. 
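The comment above describes the trick: rather than threading a "may use reserves" bit through every allocation call under __netif_receive_skb(), the task-wide PF_MEMALLOC flag is set for the duration of the call and the caller's original value restored on every exit path. A sketch of the save/set/restore discipline (do_receive() is a hypothetical stand-in for the real body):

	static int receive_scoped(struct sk_buff *skb)
	{
		unsigned long pflags = current->flags;
		int ret;

		if (sk_memalloc_socks() && skb_pfmemalloc(skb))
			current->flags |= PF_MEMALLOC;

		ret = do_receive(skb);		/* hypothetical worker */

		/* restore only the PF_MEMALLOC bit to its saved value */
		tsk_restore_flags(current, pflags, PF_MEMALLOC);
		return ret;
	}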
+ */ + if (sk_memalloc_socks() && skb_pfmemalloc(skb)) + current->flags |= PF_MEMALLOC; + /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) - return NET_RX_DROP; + goto out; orig_dev = skb->dev; @@ -3191,7 +3242,7 @@ another_round: if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { skb = vlan_untag(skb); if (unlikely(!skb)) - goto out; + goto unlock; } #ifdef CONFIG_NET_CLS_ACT @@ -3201,6 +3252,9 @@ another_round: } #endif + if (sk_memalloc_socks() && skb_pfmemalloc(skb)) + goto skip_taps; + list_for_each_entry_rcu(ptype, &ptype_all, list) { if (!ptype->dev || ptype->dev == skb->dev) { if (pt_prev) @@ -3209,13 +3263,18 @@ another_round: } } +skip_taps: #ifdef CONFIG_NET_CLS_ACT skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) - goto out; + goto unlock; ncls: #endif + if (sk_memalloc_socks() && skb_pfmemalloc(skb) + && !skb_pfmemalloc_protocol(skb)) + goto drop; + rx_handler = rcu_dereference(skb->dev->rx_handler); if (vlan_tx_tag_present(skb)) { if (pt_prev) { @@ -3225,7 +3284,7 @@ ncls: if (vlan_do_receive(&skb, !rx_handler)) goto another_round; else if (unlikely(!skb)) - goto out; + goto unlock; } if (rx_handler) { @@ -3235,7 +3294,7 @@ ncls: } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: - goto out; + goto unlock; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: @@ -3264,10 +3323,11 @@ ncls: if (pt_prev) { if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) - ret = -ENOMEM; + goto drop; else ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { +drop: atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); /* Jamal, now you will not able to escape explaining @@ -3276,8 +3336,10 @@ ncls: ret = NET_RX_DROP; } -out: +unlock: rcu_read_unlock(); +out: + tsk_restore_flags(current, pflags, PF_MEMALLOC); return ret; } @@ -4801,6 +4863,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) err = ops->ndo_set_mac_address(dev, sa); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + add_device_randomness(dev->dev_addr, dev->addr_len); return err; } EXPORT_SYMBOL(dev_set_mac_address); @@ -5579,6 +5642,7 @@ int register_netdevice(struct net_device *dev) dev_init_scheduler(dev); dev_hold(dev); list_netdevice(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); @@ -5682,6 +5746,7 @@ EXPORT_SYMBOL(netdev_refcnt_read); /** * netdev_wait_allrefs - wait until all references are gone. + * @dev: target net_device * * This is called when unregistering network devices. * @@ -5942,6 +6007,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_segs = GSO_MAX_SEGS; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); diff --git a/net/core/dst.c b/net/core/dst.c index 069d51d29414..56d63612e1e4 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -149,7 +149,15 @@ int dst_discard(struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard); -const u32 dst_default_metrics[RTAX_MAX]; +const u32 dst_default_metrics[RTAX_MAX + 1] = { + /* This initializer is needed to force linker to place this variable + * into const section. Otherwise it might end into bss section. + * We really want to avoid false sharing on this variable, and catch + * any writes on it. 
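The dst_default_metrics comment above is worth unpacking: a const array with no (or an all-zero) initializer may be emitted as a tentative definition and land in .bss, which is writable, so a buggy write would silently corrupt the shared defaults. One deliberately non-zero element forces the object into the read-only section, turning any stray write into a fault. The same trick in isolation (METRICS_MAX is a stand-in, not RTAX_MAX):

	#define METRICS_MAX 16

	static const unsigned int default_metrics[METRICS_MAX + 1] = {
		/* sentinel: never read, only here to keep the array
		 * out of .bss and in .rodata */
		[METRICS_MAX] = 0xdeadbeef,
	};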
+ */ + [RTAX_MAX] = 0xdeadbeef, +}; + void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) diff --git a/net/core/filter.c b/net/core/filter.c index d4ce2dc712e3..907efd27ec77 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -83,6 +83,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) int err; struct sk_filter *filter; + /* + * If the skb was allocated from pfmemalloc reserves, only + * allow SOCK_MEMALLOC sockets to use it as this socket is + * helping free memory + */ + if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) + return -ENOMEM; + err = security_sock_rcv_skb(sk, skb); if (err) return err; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b4c90e42b443..e4ba3e70c174 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -26,6 +26,7 @@ #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/if_vlan.h> #include <net/tcp.h> #include <net/udp.h> #include <asm/unaligned.h> @@ -54,7 +55,7 @@ static atomic_t trapped; MAX_UDP_CHUNK) static void zap_completion_queue(void); -static void arp_reply(struct sk_buff *skb); +static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); @@ -170,7 +171,8 @@ static void poll_napi(struct net_device *dev) list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(dev->npinfo, napi, budget); + budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), + napi, budget); spin_unlock(&napi->poll_lock); if (!budget) @@ -185,13 +187,14 @@ static void service_arp_queue(struct netpoll_info *npi) struct sk_buff *skb; while ((skb = skb_dequeue(&npi->arp_tx))) - arp_reply(skb); + netpoll_arp_reply(skb, npi); } } static void netpoll_poll_dev(struct net_device *dev) { const struct net_device_ops *ops; + struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); if (!dev || !netif_running(dev)) return; @@ -206,17 +209,18 @@ static void netpoll_poll_dev(struct net_device *dev) poll_napi(dev); if (dev->flags & IFF_SLAVE) { - if (dev->npinfo) { + if (ni) { struct net_device *bond_dev = dev->master; struct sk_buff *skb; - while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) { + struct netpoll_info *bond_ni = rcu_dereference_bh(bond_dev->npinfo); + while ((skb = skb_dequeue(&ni->arp_tx))) { skb->dev = bond_dev; - skb_queue_tail(&bond_dev->npinfo->arp_tx, skb); + skb_queue_tail(&bond_ni->arp_tx, skb); } } } - service_arp_queue(dev->npinfo); + service_arp_queue(ni); zap_completion_queue(); } @@ -302,6 +306,7 @@ static int netpoll_owner_active(struct net_device *dev) return 0; } +/* call with IRQ disabled */ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev) { @@ -309,8 +314,11 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, unsigned long tries; const struct net_device_ops *ops = dev->netdev_ops; /* It is up to the caller to keep npinfo alive. 
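From here on, dev->npinfo is an RCU-bh protected pointer: softirq-side readers fetch it with rcu_dereference_bh(), and the cleanup path (added near the end of the netpoll diff) unpublishes the pointer and defers the actual free past all in-flight BH readers with call_rcu_bh(). The two sides of the contract, sketched:

	/* Reader, runs in BH context: the pointer stays valid for the
	 * duration of the softirq/RCU-bh critical section. */
	static void poll_side(struct net_device *dev)
	{
		struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);

		if (ni)
			; /* ... consume ni->arp_tx, ni->txq, ... */
	}

	/* Writer: unpublish first, free only after a BH grace period. */
	static void teardown_side(struct net_device *dev,
				  struct netpoll_info *npinfo)
	{
		RCU_INIT_POINTER(dev->npinfo, NULL);
		call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
	}

This is also why __netpoll_setup() now takes a gfp argument: callers that reach it in atomic context (the bonding slave path) can pass GFP_ATOMIC where GFP_KERNEL would be illegal.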
*/ - struct netpoll_info *npinfo = np->dev->npinfo; + struct netpoll_info *npinfo; + + WARN_ON_ONCE(!irqs_disabled()); + npinfo = rcu_dereference_bh(np->dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { __kfree_skb(skb); return; @@ -319,16 +327,22 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - unsigned long flags; txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - local_irq_save(flags); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { if (__netif_tx_trylock(txq)) { if (!netif_xmit_stopped(txq)) { + if (vlan_tx_tag_present(skb) && + !(netif_skb_features(skb) & NETIF_F_HW_VLAN_TX)) { + skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + if (unlikely(!skb)) + break; + skb->vlan_tci = 0; + } + status = ops->ndo_start_xmit(skb, dev); if (status == NETDEV_TX_OK) txq_trans_update(txq); @@ -347,10 +361,9 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, } WARN_ONCE(!irqs_disabled(), - "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n", + "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", dev->name, ops->ndo_start_xmit); - local_irq_restore(flags); } if (status != NETDEV_TX_OK) { @@ -423,9 +436,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) } EXPORT_SYMBOL(netpoll_send_udp); -static void arp_reply(struct sk_buff *skb) +static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo) { - struct netpoll_info *npinfo = skb->dev->npinfo; struct arphdr *arp; unsigned char *arp_ptr; int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; @@ -543,13 +555,12 @@ static void arp_reply(struct sk_buff *skb) spin_unlock_irqrestore(&npinfo->rx_lock, flags); } -int __netpoll_rx(struct sk_buff *skb) +int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) { int proto, len, ulen; int hits = 0; const struct iphdr *iph; struct udphdr *uh; - struct netpoll_info *npinfo = skb->dev->npinfo; struct netpoll *np, *tmp; if (list_empty(&npinfo->rx_np)) @@ -565,6 +576,12 @@ int __netpoll_rx(struct sk_buff *skb) return 1; } + if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { + skb = vlan_untag(skb); + if (unlikely(!skb)) + goto out; + } + proto = ntohs(eth_hdr(skb)->h_proto); if (proto != ETH_P_IP) goto out; @@ -715,7 +732,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) { struct netpoll_info *npinfo; const struct net_device_ops *ops; @@ -734,7 +751,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) } if (!ndev->npinfo) { - npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); + npinfo = kmalloc(sizeof(*npinfo), gfp); if (!npinfo) { err = -ENOMEM; goto out; @@ -752,7 +769,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) ops = np->dev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo); + err = ops->ndo_netpoll_setup(ndev, npinfo, gfp); if (err) goto free_npinfo; } @@ -857,7 +874,7 @@ int netpoll_setup(struct netpoll *np) refill_skbs(); rtnl_lock(); - err = __netpoll_setup(np, ndev); + err = __netpoll_setup(np, ndev, GFP_KERNEL); rtnl_unlock(); if (err) @@ -878,6 +895,24 @@ static int __init 
netpoll_init(void) } core_initcall(netpoll_init); +static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) +{ + struct netpoll_info *npinfo = + container_of(rcu_head, struct netpoll_info, rcu); + + skb_queue_purge(&npinfo->arp_tx); + skb_queue_purge(&npinfo->txq); + + /* we can't call cancel_delayed_work_sync here, as we are in softirq */ + cancel_delayed_work(&npinfo->tx_work); + + /* clean after last, unfinished work */ + __skb_queue_purge(&npinfo->txq); + /* now cancel it again */ + cancel_delayed_work(&npinfo->tx_work); + kfree(npinfo); +} + void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; @@ -903,20 +938,24 @@ void __netpoll_cleanup(struct netpoll *np) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); + call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); + } +} +EXPORT_SYMBOL_GPL(__netpoll_cleanup); - /* avoid racing with NAPI reading npinfo */ - synchronize_rcu_bh(); +static void rcu_cleanup_netpoll(struct rcu_head *rcu_head) +{ + struct netpoll *np = container_of(rcu_head, struct netpoll, rcu); - skb_queue_purge(&npinfo->arp_tx); - skb_queue_purge(&npinfo->txq); - cancel_delayed_work_sync(&npinfo->tx_work); + __netpoll_cleanup(np); + kfree(np); +} - /* clean after last, unfinished work */ - __skb_queue_purge(&npinfo->txq); - kfree(npinfo); - } +void __netpoll_free_rcu(struct netpoll *np) +{ + call_rcu_bh(&np->rcu, rcu_cleanup_netpoll); } -EXPORT_SYMBOL_GPL(__netpoll_cleanup); +EXPORT_SYMBOL_GPL(__netpoll_free_rcu); void netpoll_cleanup(struct netpoll *np) { diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index ed0c0431fcd8..c75e3f9d060f 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -101,12 +101,10 @@ static int write_update_netdev_table(struct net_device *dev) u32 max_len; struct netprio_map *map; - rtnl_lock(); max_len = atomic_read(&max_prioidx) + 1; map = rtnl_dereference(dev->priomap); if (!map || map->priomap_len < max_len) ret = extend_netdev_table(dev, max_len); - rtnl_unlock(); return ret; } @@ -256,17 +254,17 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft, if (!dev) goto out_free_devname; + rtnl_lock(); ret = write_update_netdev_table(dev); if (ret < 0) goto out_put_dev; - rcu_read_lock(); - map = rcu_dereference(dev->priomap); + map = rtnl_dereference(dev->priomap); if (map) map->priomap[prioidx] = priority; - rcu_read_unlock(); out_put_dev: + rtnl_unlock(); dev_put(dev); out_free_devname: @@ -277,12 +275,6 @@ out_free_devname: void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *p; - char *tmp = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); - - if (!tmp) { - pr_warn("Unable to attach cgrp due to alloc failure!\n"); - return; - } cgroup_taskset_for_each(p, cgrp, tset) { unsigned int fd; @@ -296,32 +288,24 @@ void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) continue; } - rcu_read_lock(); + spin_lock(&files->file_lock); fdt = files_fdtable(files); for (fd = 0; fd < fdt->max_fds; fd++) { - char *path; struct file *file; struct socket *sock; - unsigned long s; - int rv, err = 0; + int err; file = fcheck_files(files, fd); if (!file) continue; - path = d_path(&file->f_path, tmp, PAGE_SIZE); - rv = sscanf(path, "socket:[%lu]", &s); - if (rv <= 0) - continue; - sock = sock_from_file(file, &err); - if (!err) + if (sock) sock_update_netprioidx(sock->sk, p); } - rcu_read_unlock(); + spin_unlock(&files->file_lock); task_unlock(p); } - kfree(tmp); } static struct cftype ss_files[] = { diff --git 
a/net/core/pktgen.c b/net/core/pktgen.c index cce9e53528b1..148e73d2c451 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2721,7 +2721,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - pkt_dev->pkt_overhead; - if (datalen < sizeof(struct pktgen_hdr)) + if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) datalen = sizeof(struct pktgen_hdr); udph->source = htons(pkt_dev->cur_udp_src); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 334b930e0de3..2c5a0a06c4ce 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -625,9 +625,13 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, .rta_id = id, }; - if (expires) - ci.rta_expires = jiffies_to_clock_t(expires); + if (expires) { + unsigned long clock; + clock = jiffies_to_clock_t(abs(expires)); + clock = min_t(unsigned long, clock, INT_MAX); + ci.rta_expires = (expires > 0) ? clock : -clock; + } return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); @@ -659,6 +663,12 @@ static void set_operstate(struct net_device *dev, unsigned char transition) } } +static unsigned int rtnl_dev_get_flags(const struct net_device *dev) +{ + return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | + (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); +} + static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, const struct ifinfomsg *ifm) { @@ -667,7 +677,7 @@ static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ if (ifm->ifi_change) flags = (flags & ifm->ifi_change) | - (dev->flags & ~ifm->ifi_change); + (rtnl_dev_get_flags(dev) & ~ifm->ifi_change); return flags; } @@ -1371,6 +1381,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, goto errout; send_addr_notify = 1; modified = 1; + add_device_randomness(dev->dev_addr, dev->addr_len); } if (tb[IFLA_MTU]) { diff --git a/net/core/scm.c b/net/core/scm.c index 8f6ccfd68ef4..040cebeed45b 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -265,6 +265,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax; i++, cmfptr++) { + struct socket *sock; int new_fd; err = security_file_receive(fp[i]); if (err) @@ -281,6 +282,9 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) } /* Bump the usage count and install the file. */ get_file(fp[i]); + sock = sock_from_file(fp[i], &err); + if (sock) + sock_update_netprioidx(sock->sk, current); fd_install(new_fd, fp[i]); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 368f65c15e4f..e33ebae519c8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -145,6 +145,43 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) BUG(); } + +/* + * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells + * the caller if emergency pfmemalloc reserves are being used. If it is and + * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves + * may be used. 
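A condensed sketch of the two-pass strategy this comment describes, simplified from __kmalloc_reserve() just below (gfp_pfmemalloc_allowed() is the mm-side helper this series depends on):

static void *alloc_maybe_reserve(size_t size, gfp_t flags, bool *pfmemalloc)
{
        /* pass 1: explicitly keep away from the emergency reserves */
        void *obj = kmalloc(size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN);

        *pfmemalloc = false;
        if (obj || !gfp_pfmemalloc_allowed(flags))
                return obj;

        /* pass 2: caller is entitled to reserves; record that we used them */
        *pfmemalloc = true;
        return kmalloc(size, flags);
}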
Otherwise, the packet data may be discarded until enough + * memory is free + */ +#define kmalloc_reserve(size, gfp, node, pfmemalloc) \ + __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) +void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip, + bool *pfmemalloc) +{ + void *obj; + bool ret_pfmemalloc = false; + + /* + * Try a regular allocation, when that fails and we're not entitled + * to the reserves, fail. + */ + obj = kmalloc_node_track_caller(size, + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, + node); + if (obj || !(gfp_pfmemalloc_allowed(flags))) + goto out; + + /* Try again but now we are using pfmemalloc reserves */ + ret_pfmemalloc = true; + obj = kmalloc_node_track_caller(size, flags, node); + +out: + if (pfmemalloc) + *pfmemalloc = ret_pfmemalloc; + + return obj; +} + /* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the * [BEEP] leaks. @@ -155,8 +192,10 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) * __alloc_skb - allocate a network buffer * @size: size to allocate * @gfp_mask: allocation mask - * @fclone: allocate from fclone cache instead of head cache - * and allocate a cloned (child) skb + * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache + * instead of head cache and allocate a cloned (child) skb. + * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for + * allocations in case the data is required for writeback * @node: numa node to allocate memory on * * Allocate a new &sk_buff. The returned buffer has no headroom and a @@ -167,14 +206,19 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) * %GFP_ATOMIC. */ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, - int fclone, int node) + int flags, int node) { struct kmem_cache *cache; struct skb_shared_info *shinfo; struct sk_buff *skb; u8 *data; + bool pfmemalloc; - cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; + cache = (flags & SKB_ALLOC_FCLONE) + ? skbuff_fclone_cache : skbuff_head_cache; + + if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) + gfp_mask |= __GFP_MEMALLOC; /* Get the HEAD */ skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); @@ -189,7 +233,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, */ size = SKB_DATA_ALIGN(size); size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - data = kmalloc_node_track_caller(size, gfp_mask, node); + data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); if (!data) goto nodata; /* kmalloc(size) might give us more room than requested. 
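The net effect of this flag plumbing: an RX allocation can be tagged as reclaim-relevant, may then be satisfied from the reserves, and the resulting skb carries the taint with it. An illustrative, hypothetical driver-side call:

static struct sk_buff *rx_alloc(unsigned int len)
{
        /* RX buffer: may dip into reserves once memalloc sockets exist */
        struct sk_buff *skb = __alloc_skb(len, GFP_ATOMIC, SKB_ALLOC_RX,
                                          NUMA_NO_NODE);

        /* if skb_pfmemalloc(skb) is true the buffer is tainted: sk_filter()
         * will drop it unless the receiving socket is SOCK_MEMALLOC
         */
        return skb;
}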
@@ -207,6 +251,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, memset(skb, 0, offsetof(struct sk_buff, tail)); /* Account for allocated memory : skb + skb->head */ skb->truesize = SKB_TRUESIZE(size); + skb->pfmemalloc = pfmemalloc; atomic_set(&skb->users, 1); skb->head = data; skb->data = data; @@ -222,7 +267,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, atomic_set(&shinfo->dataref, 1); kmemcheck_annotate_variable(shinfo->destructor_arg); - if (fclone) { + if (flags & SKB_ALLOC_FCLONE) { struct sk_buff *child = skb + 1; atomic_t *fclone_ref = (atomic_t *) (child + 1); @@ -232,6 +277,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, atomic_set(fclone_ref, 1); child->fclone = SKB_FCLONE_UNAVAILABLE; + child->pfmemalloc = pfmemalloc; } out: return skb; @@ -302,14 +348,7 @@ static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); #define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES) -/** - * netdev_alloc_frag - allocate a page fragment - * @fragsz: fragment size - * - * Allocates a frag from a page for receive buffer. - * Uses GFP_ATOMIC allocations. - */ -void *netdev_alloc_frag(unsigned int fragsz) +static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { struct netdev_alloc_cache *nc; void *data = NULL; @@ -319,7 +358,7 @@ void *netdev_alloc_frag(unsigned int fragsz) nc = &__get_cpu_var(netdev_alloc_cache); if (unlikely(!nc->page)) { refill: - nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD); + nc->page = alloc_page(gfp_mask); if (unlikely(!nc->page)) goto end; recycle: @@ -343,6 +382,18 @@ end: local_irq_restore(flags); return data; } + +/** + * netdev_alloc_frag - allocate a page fragment + * @fragsz: fragment size + * + * Allocates a frag from a page for receive buffer. + * Uses GFP_ATOMIC allocations. 
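The per-cpu cache above amortizes one page allocation over many small RX buffers. Its core carving pattern looks roughly like the sketch below; the real code additionally avoids the per-fragment atomic by biasing the page refcount up front (NETDEV_PAGECNT_BIAS), which this sketch replaces with a plain get_page().

struct frag_cache {
        struct page *page;
        unsigned int offset;
};
static DEFINE_PER_CPU(struct frag_cache, frag_cache);

static void *frag_alloc(unsigned int fragsz, gfp_t gfp)
{
        struct frag_cache *fc;
        void *data = NULL;
        unsigned long flags;

        local_irq_save(flags);          /* per-cpu state, any context */
        fc = this_cpu_ptr(&frag_cache);
        if (!fc->page || fc->offset + fragsz > PAGE_SIZE) {
                if (fc->page)
                        put_page(fc->page);     /* drop the cache's ref */
                fc->page = alloc_page(gfp);
                fc->offset = 0;
        }
        if (fc->page) {
                get_page(fc->page);     /* this ref belongs to the fragment */
                data = page_address(fc->page) + fc->offset;
                fc->offset += fragsz;
        }
        local_irq_restore(flags);
        /* consumer releases via put_page(virt_to_head_page(data)) */
        return data;
}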
+ */ +void *netdev_alloc_frag(unsigned int fragsz) +{ + return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); +} EXPORT_SYMBOL(netdev_alloc_frag); /** @@ -366,7 +417,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { - void *data = netdev_alloc_frag(fragsz); + void *data; + + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + data = __netdev_alloc_frag(fragsz, gfp_mask); if (likely(data)) { skb = build_skb(data, fragsz); @@ -374,7 +430,8 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, put_page(virt_to_head_page(data)); } } else { - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, + SKB_ALLOC_RX, NUMA_NO_NODE); } if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); @@ -656,6 +713,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #if IS_ENABLED(CONFIG_IP_VS) new->ipvs_property = old->ipvs_property; #endif + new->pfmemalloc = old->pfmemalloc; new->protocol = old->protocol; new->mark = old->mark; new->skb_iif = old->skb_iif; @@ -814,6 +872,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) n->fclone = SKB_FCLONE_CLONE; atomic_inc(fclone_ref); } else { + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); if (!n) return NULL; @@ -850,6 +911,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } +static inline int skb_alloc_rx_flag(const struct sk_buff *skb) +{ + if (skb_pfmemalloc(skb)) + return SKB_ALLOC_RX; + return 0; +} + /** * skb_copy - create private copy of an sk_buff * @skb: buffer to copy @@ -871,7 +939,8 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { int headerlen = skb_headroom(skb); unsigned int size = skb_end_offset(skb) + skb->data_len; - struct sk_buff *n = alloc_skb(size, gfp_mask); + struct sk_buff *n = __alloc_skb(size, gfp_mask, + skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; @@ -906,7 +975,8 @@ EXPORT_SYMBOL(skb_copy); struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) { unsigned int size = skb_headlen(skb) + headroom; - struct sk_buff *n = alloc_skb(size, gfp_mask); + struct sk_buff *n = __alloc_skb(size, gfp_mask, + skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) goto out; @@ -979,8 +1049,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, size = SKB_DATA_ALIGN(size); - data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), - gfp_mask); + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), + gfp_mask, NUMA_NO_NODE, NULL); if (!data) goto nodata; size = SKB_WITH_OVERHEAD(ksize(data)); @@ -1092,8 +1164,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, /* * Allocate the copy buffer */ - struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask); + struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; int off; @@ -2775,8 +2848,9 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) skb_release_head_state(nskb); __skb_push(nskb, doffset); } else { - nskb = alloc_skb(hsize + doffset + 
headroom, - GFP_ATOMIC); + nskb = __alloc_skb(hsize + doffset + headroom, + GFP_ATOMIC, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); if (unlikely(!nskb)) goto err; @@ -3428,7 +3502,9 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (!skb_cloned(from)) skb_shinfo(from)->nr_frags = 0; - /* if the skb is cloned this does nothing since we set nr_frags to 0 */ + /* if the skb is not cloned this does nothing + * since we set nr_frags to 0. + */ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) skb_frag_ref(from, i); diff --git a/net/core/sock.c b/net/core/sock.c index 2676a88f533e..305792076121 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -142,7 +142,7 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { struct proto *proto; @@ -271,6 +271,61 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); +struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL_GPL(memalloc_socks); + +/** + * sk_set_memalloc - sets %SOCK_MEMALLOC + * @sk: socket to set it on + * + * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. + * It's the responsibility of the admin to adjust min_free_kbytes + * to meet the requirements + */ +void sk_set_memalloc(struct sock *sk) +{ + sock_set_flag(sk, SOCK_MEMALLOC); + sk->sk_allocation |= __GFP_MEMALLOC; + static_key_slow_inc(&memalloc_socks); +} +EXPORT_SYMBOL_GPL(sk_set_memalloc); + +void sk_clear_memalloc(struct sock *sk) +{ + sock_reset_flag(sk, SOCK_MEMALLOC); + sk->sk_allocation &= ~__GFP_MEMALLOC; + static_key_slow_dec(&memalloc_socks); + + /* + * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward + * progress of swapping. However, if SOCK_MEMALLOC is cleared while + * it has rmem allocations there is a risk that the user of the + * socket cannot make forward progress due to exceeding the rmem + * limits. By rights, sk_clear_memalloc() should only be called + * on sockets being torn down but warn and reset the accounting if + * that assumption breaks. 
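Usage-wise, a transport that must keep making progress during reclaim (swap over network storage is the motivating case) marks its socket once, and everything allocated on its behalf inherits access to the reserves. A hypothetical sketch:

/* hypothetical: mark a storage transport's socket as reclaim-critical */
static void storage_mark_socket(struct socket *sock)
{
        /* sets SOCK_MEMALLOC, ORs __GFP_MEMALLOC into sk_allocation and
         * flips the memalloc_socks static key so the RX paths start
         * honouring the reserves
         */
        sk_set_memalloc(sock->sk);
}

static void storage_unmark_socket(struct socket *sock)
{
        /* must be undone on teardown, ideally with no rmem still charged */
        sk_clear_memalloc(sock->sk);
}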
+ */ + if (WARN_ON(sk->sk_forward_alloc)) + sk_mem_reclaim(sk); +} +EXPORT_SYMBOL_GPL(sk_clear_memalloc); + +int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + int ret; + unsigned long pflags = current->flags; + + /* these should have been dropped before queueing */ + BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); + + current->flags |= PF_MEMALLOC; + ret = sk->sk_backlog_rcv(sk, skb); + tsk_restore_flags(current, pflags, PF_MEMALLOC); + + return ret; +} +EXPORT_SYMBOL(__sk_backlog_rcv); + #if defined(CONFIG_CGROUPS) #if !defined(CONFIG_NET_CLS_CGROUP) int net_cls_subsys_id = -1; @@ -353,7 +408,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (err) return err; - if (!sk_rmem_schedule(sk, skb->truesize)) { + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { atomic_inc(&sk->sk_drops); return -ENOBUFS; } @@ -1403,6 +1458,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; + sk->sk_gso_max_segs = dst->dev->gso_max_segs; } } } @@ -1467,7 +1523,14 @@ EXPORT_SYMBOL(sock_rfree); void sock_edemux(struct sk_buff *skb) { - sock_put(skb->sk); + struct sock *sk = skb->sk; + +#ifdef CONFIG_INET + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put(inet_twsk(sk)); + else +#endif + sock_put(sk); } EXPORT_SYMBOL(sock_edemux); diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 75c3582a7678..fb85d371a8de 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -246,7 +246,7 @@ static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; - if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) + if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, optval, optlen); return rc; @@ -257,7 +257,7 @@ static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; - if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) + if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, optval, optlen); return rc; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index d65e98798eca..119c04317d48 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -535,6 +535,7 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, case DCCP_SOCKOPT_CCID_TX_INFO: if (len < sizeof(tfrc)) return -EINVAL; + memset(&tfrc, 0, sizeof(tfrc)); tfrc.tfrctx_x = hc->tx_x; tfrc.tfrctx_x_recv = hc->tx_x_recv; tfrc.tfrctx_x_calc = hc->tx_x_calc; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ae2ccf2890e4..15ca63ec604e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -49,7 +49,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o +obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a0124eb7dbea..47800459e4cb 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -827,7 +827,7 @@ static int arp_process(struct sk_buff *skb) } if (arp->ar_op == htons(ARPOP_REQUEST) && - ip_route_input(skb, tip, sip, 0, dev) == 0) { + 
ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; @@ -1225,7 +1225,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); - rt_cache_flush(dev_net(dev), 0); + rt_cache_flush(dev_net(dev)); break; default: break; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 44bf82e3aef7..e12fad773852 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -725,7 +725,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; case SIOCSIFFLAGS: - ret = -EACCES; + ret = -EPERM; if (!capable(CAP_NET_ADMIN)) goto out; break; @@ -733,7 +733,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ - ret = -EACCES; + ret = -EPERM; if (!capable(CAP_NET_ADMIN)) goto out; ret = -EINVAL; @@ -1503,7 +1503,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write, if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) - rt_cache_flush(net, 0); + rt_cache_flush(net); } return ret; @@ -1537,7 +1537,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, dev_disable_lro(idev->dev); } rtnl_unlock(); - rt_cache_flush(net, 0); + rt_cache_flush(net); } } @@ -1554,7 +1554,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write, struct net *net = ctl->extra2; if (write && *valp != val) - rt_cache_flush(net, 0); + rt_cache_flush(net); return ret; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8732cc7920ed..8e2b475da9fa 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -148,7 +148,7 @@ static void fib_flush(struct net *net) } if (flushed) - rt_cache_flush(net, -1); + rt_cache_flush(net); } /* @@ -999,11 +999,11 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force, int delay) +static void fib_disable_ip(struct net_device *dev, int force) { if (fib_sync_down_dev(dev, force)) fib_flush(dev_net(dev)); - rt_cache_flush(dev_net(dev), delay); + rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } @@ -1020,7 +1020,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, fib_sync_up(dev); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(dev_net(dev)); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); @@ -1029,9 +1029,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. * Disable IP. 
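All the rt_cache_flush(net, delay) callers collapse to rt_cache_flush(net) because, with lazy validation, a flush is nothing more than a generation bump (see the route.c hunks further down). In outline, a sketch of the scheme:

/* invalidate: O(1), no cache walk, just advance the generation */
static inline void flush_routes(struct net *net)
{
        atomic_inc(&net->rt_genid);
}

/* validate lazily at lookup time: stale entries simply stop matching */
static inline bool route_is_current(const struct rtable *rth)
{
        return rth->rt_genid == atomic_read(&dev_net(rth->dst.dev)->rt_genid);
}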
*/ - fib_disable_ip(dev, 1, 0); + fib_disable_ip(dev, 1); } else { - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(dev_net(dev)); } break; } @@ -1045,7 +1045,8 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net *net = dev_net(dev); if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2, -1); + fib_disable_ip(dev, 2); + rt_flush_dev(dev); return NOTIFY_DONE; } @@ -1061,14 +1062,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_sync_up(dev); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(dev_net(dev)); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0, 0); + fib_disable_ip(dev, 0); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: - rt_cache_flush(dev_net(dev), 0); + rt_cache_flush(dev_net(dev)); break; case NETDEV_UNREGISTER_BATCH: break; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index a83d74e498d2..274309d3aded 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -259,7 +259,7 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) static void fib4_rule_flush_cache(struct fib_rules_ops *ops) { - rt_cache_flush(ops->fro_net, -1); + rt_cache_flush(ops->fro_net); } static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e55171f184f9..da80dc14cc76 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -140,6 +140,21 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { }, }; +static void rt_fibinfo_free(struct rtable __rcu **rtp) +{ + struct rtable *rt = rcu_dereference_protected(*rtp, 1); + + if (!rt) + return; + + /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); + * because we waited an RCU grace period before calling + * free_fib_info_rcu() + */ + + dst_free(&rt->dst); +} + static void free_nh_exceptions(struct fib_nh *nh) { struct fnhe_hash_bucket *hash = nh->nh_exceptions; @@ -153,6 +168,9 @@ static void free_nh_exceptions(struct fib_nh *nh) struct fib_nh_exception *next; next = rcu_dereference_protected(fnhe->fnhe_next, 1); + + rt_fibinfo_free(&fnhe->fnhe_rth); + kfree(fnhe); fnhe = next; @@ -161,6 +179,23 @@ static void free_nh_exceptions(struct fib_nh *nh) kfree(hash); } +static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) +{ + int cpu; + + if (!rtp) + return; + + for_each_possible_cpu(cpu) { + struct rtable *rt; + + rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); + if (rt) + dst_free(&rt->dst); + } + free_percpu(rtp); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { @@ -171,10 +206,8 @@ static void free_fib_info_rcu(struct rcu_head *head) dev_put(nexthop_nh->nh_dev); if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); - if (nexthop_nh->nh_rth_output) - dst_release(&nexthop_nh->nh_rth_output->dst); - if (nexthop_nh->nh_rth_input) - dst_release(&nexthop_nh->nh_rth_input->dst); + rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); + rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); release_net(fi->fib_net); @@ -804,6 +837,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_nhs = nhs; change_nexthops(fi) { nexthop_nh->nh_parent = fi; + nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); } endfor_nexthops(fi) if (cfg->fc_mx) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 18cbc15b20d5..d1b93595b4a7 100644 --- a/net/ipv4/fib_trie.c +++ 
b/net/ipv4/fib_trie.c @@ -159,7 +159,6 @@ struct trie { #endif }; -static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull); static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); @@ -368,7 +367,7 @@ static void __leaf_free_rcu(struct rcu_head *head) static inline void free_leaf(struct leaf *l) { - call_rcu_bh(&l->rcu, __leaf_free_rcu); + call_rcu(&l->rcu, __leaf_free_rcu); } static inline void free_leaf_info(struct leaf_info *leaf) @@ -473,7 +472,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) } pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct rt_trie_node) << bits); + sizeof(struct rt_trie_node *) << bits); return tn; } @@ -490,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node * return ((struct tnode *) n)->pos == tn->pos + tn->bits; } -static inline void put_child(struct trie *t, struct tnode *tn, int i, +static inline void put_child(struct tnode *tn, int i, struct rt_trie_node *n) { tnode_put_child_reorg(tn, i, n, -1); @@ -754,8 +753,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) goto nomem; } - put_child(t, tn, 2*i, (struct rt_trie_node *) left); - put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); + put_child(tn, 2*i, (struct rt_trie_node *) left); + put_child(tn, 2*i+1, (struct rt_trie_node *) right); } } @@ -776,9 +775,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 1) == 0) - put_child(t, tn, 2*i, node); + put_child(tn, 2*i, node); else - put_child(t, tn, 2*i+1, node); + put_child(tn, 2*i+1, node); continue; } @@ -786,8 +785,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) inode = (struct tnode *) node; if (inode->bits == 1) { - put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); - put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); + put_child(tn, 2*i, rtnl_dereference(inode->child[0])); + put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); tnode_free_safe(inode); continue; @@ -817,22 +816,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) */ left = (struct tnode *) tnode_get_child(tn, 2*i); - put_child(t, tn, 2*i, NULL); + put_child(tn, 2*i, NULL); BUG_ON(!left); right = (struct tnode *) tnode_get_child(tn, 2*i+1); - put_child(t, tn, 2*i+1, NULL); + put_child(tn, 2*i+1, NULL); BUG_ON(!right); size = tnode_child_length(left); for (j = 0; j < size; j++) { - put_child(t, left, j, rtnl_dereference(inode->child[j])); - put_child(t, right, j, rtnl_dereference(inode->child[j + size])); + put_child(left, j, rtnl_dereference(inode->child[j])); + put_child(right, j, rtnl_dereference(inode->child[j + size])); } - put_child(t, tn, 2*i, resize(t, left)); - put_child(t, tn, 2*i+1, resize(t, right)); + put_child(tn, 2*i, resize(t, left)); + put_child(tn, 2*i+1, resize(t, right)); tnode_free_safe(inode); } @@ -877,7 +876,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (!newn) goto nomem; - put_child(t, tn, i/2, (struct rt_trie_node *)newn); + put_child(tn, i/2, (struct rt_trie_node *)newn); } } @@ -892,21 +891,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (left == NULL) { if (right == NULL) /* Both are empty */ continue; - put_child(t, tn, i/2, right); + put_child(tn, i/2, right); continue; } if (right == NULL) { - put_child(t, tn, i/2, left); + put_child(tn, i/2, 
left); continue; } /* Two nonempty children */ newBinNode = (struct tnode *) tnode_get_child(tn, i/2); - put_child(t, tn, i/2, NULL); - put_child(t, newBinNode, 0, left); - put_child(t, newBinNode, 1, right); - put_child(t, tn, i/2, resize(t, newBinNode)); + put_child(tn, i/2, NULL); + put_child(newBinNode, 0, left); + put_child(newBinNode, 1, right); + put_child(tn, i/2, resize(t, newBinNode)); } tnode_free_safe(oldtnode); return tn; @@ -1125,7 +1124,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) node_set_parent((struct rt_trie_node *)l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, tp, cindex, (struct rt_trie_node *)l); + put_child(tp, cindex, (struct rt_trie_node *)l); } else { /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* @@ -1155,12 +1154,12 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) node_set_parent((struct rt_trie_node *)tn, tp); missbit = tkey_extract_bits(key, newpos, 1); - put_child(t, tn, missbit, (struct rt_trie_node *)l); - put_child(t, tn, 1-missbit, n); + put_child(tn, missbit, (struct rt_trie_node *)l); + put_child(tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, tp, cindex, (struct rt_trie_node *)tn); + put_child(tp, cindex, (struct rt_trie_node *)tn); } else { rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tp = tn; @@ -1287,7 +1286,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); @@ -1334,7 +1333,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) list_add_tail_rcu(&new_fa->fa_list, (fa ? 
&fa->fa_list : fa_head)); - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, 0); succeeded: @@ -1619,7 +1618,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) if (tp) { t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); - put_child(t, tp, cindex, NULL); + put_child(tp, cindex, NULL); trie_rebalance(t, tp); } else RCU_INIT_POINTER(t->trie, NULL); @@ -1709,7 +1708,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) trie_leaf_remove(t, l); if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index db0cf17c00f7..7f75f21d7b83 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -404,12 +404,15 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, { const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *newinet = inet_sk(newsk); - struct ip_options_rcu *opt = ireq->opt; + struct ip_options_rcu *opt; struct net *net = sock_net(sk); struct flowi4 *fl4; struct rtable *rt; fl4 = &newinet->cork.fl.u.ip4; + + rcu_read_lock(); + opt = rcu_dereference(newinet->inet_opt); flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), @@ -421,11 +424,13 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_gateway) goto route_err; + rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: + rcu_read_unlock(); IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7ad88e5e7110..8d07c973409c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg) /* skb dst is stale, drop it, and perform route lookup again */ skb_dst_drop(head); iph = ip_hdr(head); - err = ip_route_input(head, iph->daddr, iph->saddr, - iph->tos, head->dev); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, + iph->tos, head->dev); if (err) goto out_rcu_unlock; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 4ebc6feee250..f1395a6fb35f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -314,6 +314,7 @@ drop: } int sysctl_ip_early_demux __read_mostly = 1; +EXPORT_SYMBOL(sysctl_ip_early_demux); static int ip_rcv_finish(struct sk_buff *skb) { @@ -324,11 +325,12 @@ static int ip_rcv_finish(struct sk_buff *skb) const struct net_protocol *ipprot; int protocol = iph->protocol; - rcu_read_lock(); ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot && ipprot->early_demux) + if (ipprot && ipprot->early_demux) { ipprot->early_demux(skb); - rcu_read_unlock(); + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } } /* @@ -336,8 +338,8 @@ static int ip_rcv_finish(struct sk_buff *skb) * how the packet travels inside Linux networking. 
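One subtlety in the ip_rcv_finish() hunk above is worth spelling out: an early-demux handler may pull headers and reallocate skb->head, so any header pointer taken before the call is stale afterwards. The safe shape of the pattern (sketch; ip_rcv_finish() already runs inside an RCU read-side section, hence the bare rcu_dereference()):

static void demux_then_route(struct sk_buff *skb)
{
        const struct iphdr *iph = ip_hdr(skb);
        const struct net_protocol *ipprot;

        ipprot = rcu_dereference(inet_protos[iph->protocol]);
        if (ipprot && ipprot->early_demux) {
                ipprot->early_demux(skb);
                iph = ip_hdr(skb);      /* skb->head may have moved: reload */
        }
        /* only now is iph safe to feed to the routing decision */
}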
*/ if (!skb_dst(skb)) { - int err = ip_route_input(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); + int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) NET_INC_STATS_BH(dev_net(skb->dev), diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ba39a52d18c1..c196d749daf2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -197,7 +197,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); - if (neigh) { + if (!IS_ERR(neigh)) { int res = dst_neigh_output(dst, neigh, skb); rcu_read_unlock_bh(); @@ -1338,10 +1338,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph->ihl = 5; iph->tos = inet->tos; iph->frag_off = df; - ip_select_ident(iph, &rt->dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); + ip_select_ident(iph, &rt->dst, sk); if (opt) { iph->ihl += opt->optlen>>2; @@ -1366,9 +1366,8 @@ out: return skb; } -int ip_send_skb(struct sk_buff *skb) +int ip_send_skb(struct net *net, struct sk_buff *skb) { - struct net *net = sock_net(skb->sk); int err; err = ip_local_out(skb); @@ -1391,7 +1390,7 @@ int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) return 0; /* Netfilter gets whole the not fragmented skb. */ - return ip_send_skb(skb); + return ip_send_skb(sock_net(sk), skb); } /* @@ -1536,6 +1535,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; + skb_orphan(nskb); skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 8eec8f4a0536..ebdf06f938bf 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -124,6 +124,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ipmr_new_table(struct net *net, u32 id); +static void ipmr_free_table(struct mr_table *mrt); + static int ip_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *cache, int local); @@ -131,6 +133,7 @@ static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); +static void mroute_clean_tables(struct mr_table *mrt); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -271,7 +274,7 @@ static void __net_exit ipmr_rules_exit(struct net *net) list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); - kfree(mrt); + ipmr_free_table(mrt); } fib_rules_unregister(net->ipv4.mr_rules_ops); } @@ -299,7 +302,7 @@ static int __net_init ipmr_rules_init(struct net *net) static void __net_exit ipmr_rules_exit(struct net *net) { - kfree(net->ipv4.mrt); + ipmr_free_table(net->ipv4.mrt); } #endif @@ -336,6 +339,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) return mrt; } +static void ipmr_free_table(struct mr_table *mrt) +{ + del_timer_sync(&mrt->ipmr_expire_timer); + mroute_clean_tables(mrt); + kfree(mrt); +} + /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) diff --git a/net/ipv4/netfilter/nf_nat_sip.c 
b/net/ipv4/netfilter/nf_nat_sip.c index ea4a23813d26..9c87cde28ff8 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -148,7 +148,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, hdr, NULL, &matchoff, &matchlen, &addr, &port) > 0) { - unsigned int matchend, poff, plen, buflen, n; + unsigned int olen, matchend, poff, plen, buflen, n; char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; /* We're only interested in headers related to this @@ -163,17 +163,18 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, goto next; } + olen = *datalen; if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, &addr, port)) return NF_DROP; - matchend = matchoff + matchlen; + matchend = matchoff + matchlen + *datalen - olen; /* The maddr= parameter (RFC 2361) specifies where to send * the reply. */ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, "maddr=", &poff, &plen, - &addr) > 0 && + &addr, true) > 0 && addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { buflen = sprintf(buffer, "%pI4", @@ -187,7 +188,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, * from which the server received the request. */ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, "received=", &poff, &plen, - &addr) > 0 && + &addr, false) > 0 && addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { buflen = sprintf(buffer, "%pI4", @@ -501,7 +502,10 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, ret = nf_ct_expect_related(rtcp_exp); if (ret == 0) break; - else if (ret != -EBUSY) { + else if (ret == -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + continue; + } else if (ret < 0) { nf_ct_unexpect_related(rtp_exp); port = 0; break; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6bcb8fc71cbc..fd9af60397b5 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -70,7 +70,6 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -80,7 +79,6 @@ #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> -#include <linux/workqueue.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> @@ -88,11 +86,9 @@ #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> -#include <linux/jhash.h> #include <linux/rcupdate.h> #include <linux/times.h> #include <linux/slab.h> -#include <linux/prefetch.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> @@ -147,6 +143,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); +static void ipv4_dst_destroy(struct dst_entry *dst); static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how) @@ -170,6 +167,7 @@ static struct dst_ops ipv4_dst_ops = { .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, + .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, @@ -204,11 +202,6 @@ EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define 
RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) -static inline int rt_genid(struct net *net) -{ - return atomic_read(&net->ipv4.rt_genid); -} - #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { @@ -444,32 +437,14 @@ static inline int ip_rt_proc_init(void) } #endif /* CONFIG_PROC_FS */ -static inline int rt_is_expired(struct rtable *rth) +static inline bool rt_is_expired(const struct rtable *rth) { return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); } -/* - * Perturbation of rt_genid by a small quantity [1..256] - * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() - * many times (2^24) without giving recent rt_genid. - * Jenkins hash is strong enough that litle changes of rt_genid are OK. - */ -static void rt_cache_invalidate(struct net *net) +void rt_cache_flush(struct net *net) { - unsigned char shuffle; - - get_random_bytes(&shuffle, sizeof(shuffle)); - atomic_add(shuffle + 1U, &net->ipv4.rt_genid); -} - -/* - * delay < 0 : invalidate cache (fast : entries will be deleted later) - * delay >= 0 : invalidate & flush cache (can be long) - */ -void rt_cache_flush(struct net *net, int delay) -{ - rt_cache_invalidate(net); + rt_genid_bump(net); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, @@ -587,11 +562,17 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, build_sk_flow_key(fl4, sk); } -static DEFINE_SEQLOCK(fnhe_seqlock); +static inline void rt_free(struct rtable *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + +static DEFINE_SPINLOCK(fnhe_lock); static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) { struct fib_nh_exception *fnhe, *oldest; + struct rtable *orig; oldest = rcu_dereference(hash->chain); for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; @@ -599,6 +580,11 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) oldest = fnhe; } + orig = rcu_dereference(oldest->fnhe_rth); + if (orig) { + RCU_INIT_POINTER(oldest->fnhe_rth, NULL); + rt_free(orig); + } return oldest; } @@ -620,7 +606,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, int depth; u32 hval = fnhe_hashfun(daddr); - write_seqlock_bh(&fnhe_seqlock); + spin_lock_bh(&fnhe_lock); hash = nh->nh_exceptions; if (!hash) { @@ -667,7 +653,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_stamp = jiffies; out_unlock: - write_sequnlock_bh(&fnhe_seqlock); + spin_unlock_bh(&fnhe_lock); return; } @@ -925,12 +911,14 @@ static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; + rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, jiffies + ip_rt_mtu_expires); } + rcu_read_unlock(); return mtu; } @@ -947,7 +935,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, dst->obsolete = DST_OBSOLETE_KILL; } else { rt->rt_pmtu = mtu; - dst_set_expires(&rt->dst, ip_rt_mtu_expires); + rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires); } } @@ -1164,67 +1152,126 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) return NULL; } -static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, +static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr) { - 
__be32 fnhe_daddr, gw; - unsigned long expires; - unsigned int seq; - u32 pmtu; - -restart: - seq = read_seqbegin(&fnhe_seqlock); - fnhe_daddr = fnhe->fnhe_daddr; - gw = fnhe->fnhe_gw; - pmtu = fnhe->fnhe_pmtu; - expires = fnhe->fnhe_expires; - if (read_seqretry(&fnhe_seqlock, seq)) - goto restart; - - if (daddr != fnhe_daddr) - return; + bool ret = false; + + spin_lock_bh(&fnhe_lock); - if (pmtu) { - unsigned long diff = expires - jiffies; + if (daddr == fnhe->fnhe_daddr) { + struct rtable *orig; - if (time_before(jiffies, expires)) { - rt->rt_pmtu = pmtu; - dst_set_expires(&rt->dst, diff); + if (fnhe->fnhe_pmtu) { + unsigned long expires = fnhe->fnhe_expires; + unsigned long diff = expires - jiffies; + + if (time_before(jiffies, expires)) { + rt->rt_pmtu = fnhe->fnhe_pmtu; + dst_set_expires(&rt->dst, diff); + } } + if (fnhe->fnhe_gw) { + rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_gateway = fnhe->fnhe_gw; + } + + orig = rcu_dereference(fnhe->fnhe_rth); + rcu_assign_pointer(fnhe->fnhe_rth, rt); + if (orig) + rt_free(orig); + + fnhe->fnhe_stamp = jiffies; + ret = true; + } else { + /* Routes we intend to cache in nexthop exception have + * the DST_NOCACHE bit clear. However, if we are + * unsuccessful at storing this route into the cache + * we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; } - if (gw) { - rt->rt_flags |= RTCF_REDIRECTED; - rt->rt_gateway = gw; - } - fnhe->fnhe_stamp = jiffies; -} + spin_unlock_bh(&fnhe_lock); -static inline void rt_release_rcu(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_release(dst); + return ret; } -static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) +static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) { - struct rtable *orig, *prev, **p = &nh->nh_rth_output; - - if (rt_is_input_route(rt)) - p = &nh->nh_rth_input; + struct rtable *orig, *prev, **p; + bool ret = true; + if (rt_is_input_route(rt)) { + p = (struct rtable **)&nh->nh_rth_input; + } else { + if (!nh->nh_pcpu_rth_output) + goto nocache; + p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); + } orig = *p; prev = cmpxchg(p, orig, rt); if (prev == orig) { - dst_clone(&rt->dst); if (orig) - call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu); + rt_free(orig); + } else { + /* Routes we intend to cache in the FIB nexthop have + * the DST_NOCACHE bit clear. However, if we are + * unsuccessful at storing this route into the cache + * we really need to set it. 
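The lockless install in rt_cache_route() is the classic single-slot cmpxchg publish: swap the new route in only if the slot still holds what we read, and whichever entry loses is either freed after a grace period or marked uncacheable. Reduced to a skeleton (sketch):

static bool publish_route(struct rtable **slot, struct rtable *rt)
{
        struct rtable *orig = *slot;
        struct rtable *prev = cmpxchg(slot, orig, rt);

        if (prev != orig) {
                /* lost the race: another CPU cached first, so this route
                 * must carry DST_NOCACHE and die with its last user
                 */
                rt->dst.flags |= DST_NOCACHE;
                return false;
        }
        if (orig)
                rt_free(orig);  /* RCU-deferred: readers may still hold it */
        return true;
}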
+ */ +nocache: + rt->dst.flags |= DST_NOCACHE; + ret = false; } + + return ret; +} + +static DEFINE_SPINLOCK(rt_uncached_lock); +static LIST_HEAD(rt_uncached_list); + +static void rt_add_uncached_list(struct rtable *rt) +{ + spin_lock_bh(&rt_uncached_lock); + list_add_tail(&rt->rt_uncached, &rt_uncached_list); + spin_unlock_bh(&rt_uncached_lock); } -static bool rt_cache_valid(struct rtable *rt) +static void ipv4_dst_destroy(struct dst_entry *dst) { - return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK); + struct rtable *rt = (struct rtable *) dst; + + if (!list_empty(&rt->rt_uncached)) { + spin_lock_bh(&rt_uncached_lock); + list_del(&rt->rt_uncached); + spin_unlock_bh(&rt_uncached_lock); + } +} + +void rt_flush_dev(struct net_device *dev) +{ + if (!list_empty(&rt_uncached_list)) { + struct net *net = dev_net(dev); + struct rtable *rt; + + spin_lock_bh(&rt_uncached_lock); + list_for_each_entry(rt, &rt_uncached_list, rt_uncached) { + if (rt->dst.dev != dev) + continue; + rt->dst.dev = net->loopback_dev; + dev_hold(rt->dst.dev); + dev_put(dev); + } + spin_unlock_bh(&rt_uncached_lock); + } +} + +static bool rt_cache_valid(const struct rtable *rt) +{ + return rt && + rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + !rt_is_expired(rt); } static void rt_set_nexthop(struct rtable *rt, __be32 daddr, @@ -1232,20 +1279,24 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag) { + bool cached = false; + if (fi) { struct fib_nh *nh = &FIB_RES_NH(*res); if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) rt->rt_gateway = nh->nh_gw; - if (unlikely(fnhe)) - rt_bind_exception(rt, fnhe, daddr); dst_init_metrics(&rt->dst, fi->fib_metrics, true); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - if (!(rt->dst.flags & DST_HOST)) - rt_cache_route(nh, rt); + if (unlikely(fnhe)) + cached = rt_bind_exception(rt, fnhe, daddr); + else if (!(rt->dst.flags & DST_NOCACHE)) + cached = rt_cache_route(nh, rt); } + if (unlikely(!cached)) + rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -1259,7 +1310,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, bool nopolicy, bool noxfrm, bool will_cache) { return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (will_cache ? 0 : DST_HOST) | DST_NOCACHE | + (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? 
DST_NOXFRM : 0)); } @@ -1312,6 +1363,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1364,8 +1416,7 @@ static void ip_handle_martian_source(struct net_device *dev, static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos, - struct rtable **result) + __be32 daddr, __be32 saddr, u32 tos) { struct rtable *rth; int err; @@ -1414,9 +1465,9 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = false; if (res->fi) { if (!itag) { - rth = FIB_RES_NH(*res).nh_rth_input; + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); if (rt_cache_valid(rth)) { - dst_hold(&rth->dst); + skb_dst_set_noref(skb, &rth->dst); goto out; } do_cache = true; @@ -1438,13 +1489,14 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); rth->dst.input = ip_forward; rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); + skb_dst_set(skb, &rth->dst); out: - *result = rth; err = 0; cleanup: return err; @@ -1456,21 +1508,13 @@ static int ip_mkroute_input(struct sk_buff *skb, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { - struct rtable *rth = NULL; - int err; - #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) fib_select_multipath(res); #endif /* create a routing cache entry */ - err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); - if (err) - return err; - - skb_dst_set(skb, &rth->dst); - return 0; + return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } /* @@ -1584,10 +1628,11 @@ local_input: do_cache = false; if (res.fi) { if (!itag) { - rth = FIB_RES_NH(res).nh_rth_input; + rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); if (rt_cache_valid(rth)) { - dst_hold(&rth->dst); - goto set_and_out; + skb_dst_set_noref(skb, &rth->dst); + err = 0; + goto out; } do_cache = true; } @@ -1611,6 +1656,7 @@ local_input: rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; @@ -1618,7 +1664,6 @@ local_input: } if (do_cache) rt_cache_route(&FIB_RES_NH(res), rth); -set_and_out: skb_dst_set(skb, &rth->dst); err = 0; goto out; @@ -1656,8 +1701,8 @@ martian_source_keep_err: goto out; } -int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) +int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev) { int res; @@ -1700,7 +1745,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rcu_read_unlock(); return res; } -EXPORT_SYMBOL(ip_route_input); +EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, @@ -1750,19 +1795,23 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fnhe = NULL; if (fi) { + struct rtable __rcu **prth; + fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); - if (!fnhe) { - rth = FIB_RES_NH(*res).nh_rth_output; - if (rt_cache_valid(rth)) { - dst_hold(&rth->dst); - return rth; - } + if (fnhe) + prth = &fnhe->fnhe_rth; + else + prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output); + rth = 
rcu_dereference(*prth); + if (rt_cache_valid(rth)) { + dst_hold(&rth->dst); + return rth; } } rth = rt_dst_alloc(dev_out, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM), - fi && !fnhe); + fi); if (!rth) return ERR_PTR(-ENOBUFS); @@ -1775,6 +1824,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_iif = orig_oif ? : 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); RT_CACHE_STAT_INC(out_slow_tot); @@ -1957,7 +2007,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) } dev_out = net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; - res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } @@ -2054,6 +2103,8 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_type = ort->rt_type; rt->rt_gateway = ort->rt_gateway; + INIT_LIST_HEAD(&rt->rt_uncached); + dst_free(new); } @@ -2271,7 +2322,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(dev_net(in_dev->dev), 0); + rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL @@ -2280,16 +2331,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, size_t *lenp, loff_t *ppos) { if (write) { - int flush_delay; - ctl_table ctl; - struct net *net; - - memcpy(&ctl, __ctl, sizeof(ctl)); - ctl.data = &flush_delay; - proc_dointvec(&ctl, write, buffer, lenp, ppos); - - net = (struct net *)__ctl->extra1; - rt_cache_flush(net, flush_delay); + rt_cache_flush((struct net *)__ctl->extra1); return 0; } @@ -2459,8 +2501,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { static __net_init int rt_genid_init(struct net *net) { - get_random_bytes(&net->ipv4.rt_genid, - sizeof(net->ipv4.rt_genid)); + atomic_set(&net->rt_genid, 0); get_random_bytes(&net->ipv4.dev_addr_genid, sizeof(net->ipv4.dev_addr_genid)); return 0; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5840c3255721..1b5ce96707a3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -184,7 +184,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, int ret; unsigned long vec[3]; struct net *net = current->nsproxy->net_ns; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM struct mem_cgroup *memcg; #endif @@ -203,7 +203,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, if (ret) return ret; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM rcu_read_lock(); memcg = mem_cgroup_from_task(current); @@ -784,13 +784,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .procname = "rt_cache_rebuild_count", - .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "ping_group_range", .data = &init_net.ipv4.sysctl_ping_group_range, .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range), @@ -829,8 +822,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) table[5].data = &net->ipv4.sysctl_icmp_ratemask; table[6].data = - &net->ipv4.sysctl_rt_cache_rebuild_count; - table[7].data = &net->ipv4.sysctl_ping_group_range; } @@ -842,8 +833,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_ping_group_range[0] = 1; net->ipv4.sysctl_ping_group_range[1] = 0; - net->ipv4.sysctl_rt_cache_rebuild_count = 4; - tcp_init_mem(net); net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); diff --git 
a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 581ecf02c6b5..5f6419341821 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -811,7 +811,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, old_size_goal + mss_now > xmit_size_goal)) { xmit_size_goal = old_size_goal; } else { - tp->xmit_size_goal_segs = xmit_size_goal / mss_now; + tp->xmit_size_goal_segs = + min_t(u16, xmit_size_goal / mss_now, + sk->sk_gso_max_segs); xmit_size_goal = tp->xmit_size_goal_segs * mss_now; } } @@ -1760,8 +1762,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } #ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + if (tp->ucopy.dma_chan) { + if (tp->rcv_wnd == 0 && + !skb_queue_empty(&sk->sk_async_wait_queue)) { + tcp_service_net_dma(sk, true); + tcp_cleanup_rbuf(sk, copied); + } else + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + } #endif if (copied >= target) { /* Do not sleep, just process backlog. */ @@ -2323,10 +2331,17 @@ static int tcp_repair_options_est(struct tcp_sock *tp, tp->rx_opt.mss_clamp = opt.opt_val; break; case TCPOPT_WINDOW: - if (opt.opt_val > 14) - return -EFBIG; + { + u16 snd_wscale = opt.opt_val & 0xFFFF; + u16 rcv_wscale = opt.opt_val >> 16; + + if (snd_wscale > 14 || rcv_wscale > 14) + return -EFBIG; - tp->rx_opt.snd_wscale = opt.opt_val; + tp->rx_opt.snd_wscale = snd_wscale; + tp->rx_opt.rcv_wscale = rcv_wscale; + tp->rx_opt.wscale_ok = 1; + } break; case TCPOPT_SACK_PERM: if (opt.opt_val != 0) @@ -2681,7 +2696,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, /* Cap the max timeout in ms TCP will retry/retrans * before giving up and aborting (ETIMEDOUT) a connection. */ - icsk->icsk_user_timeout = msecs_to_jiffies(val); + if (val < 0) + err = -EINVAL; + else + icsk->icsk_user_timeout = msecs_to_jiffies(val); break; default: err = -ENOPROTOOPT; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 4d4db16e336e..1432cdb0644c 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -291,7 +291,8 @@ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) left = tp->snd_cwnd - in_flight; if (sk_can_gso(sk) && left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && - left * tp->mss_cache < sk->sk_gso_max_size) + left * tp->mss_cache < sk->sk_gso_max_size && + left < sk->sk_gso_max_segs) return true; return left <= tcp_max_tso_deferred_mss(tp); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e07a64ca44e..d377f4854cb8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2926,13 +2926,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) * tcp_xmit_retransmit_queue(). 
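[Editor's note] The TCPOPT_WINDOW repair hunk above packs both window-scale factors into the single 32-bit opt_val: the send scale in the low 16 bits, the receive scale in the high 16 bits, each capped at 14 per RFC 1323. A minimal userspace sketch of that decoding, assuming the same layout (names here are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* Unpack mirrors the kernel layout: snd_wscale in the low 16 bits,
 * rcv_wscale in the high 16 bits, both limited to 14. */
static int unpack_wscale(uint32_t opt_val, uint16_t *snd, uint16_t *rcv)
{
        uint16_t snd_wscale = opt_val & 0xFFFF;
        uint16_t rcv_wscale = opt_val >> 16;

        if (snd_wscale > 14 || rcv_wscale > 14)
                return -1;              /* the kernel returns -EFBIG here */
        *snd = snd_wscale;
        *rcv = rcv_wscale;
        return 0;
}

int main(void)
{
        uint16_t snd, rcv;
        uint32_t val = (7u << 16) | 5u; /* rcv_wscale=7, snd_wscale=5 */

        if (unpack_wscale(val, &snd, &rcv) == 0)
                printf("snd_wscale=%u rcv_wscale=%u\n", snd, rcv);
        return 0;
}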
*/ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, - int newly_acked_sacked, bool is_dupack, + int prior_sacked, bool is_dupack, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); + int newly_acked_sacked = 0; int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) @@ -2992,6 +2993,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, tcp_add_reno_sack(sk); } else do_lost = tcp_try_undo_partial(sk, pkts_acked); + newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; break; case TCP_CA_Loss: if (flag & FLAG_DATA_ACKED) @@ -3013,6 +3015,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (is_dupack) tcp_add_reno_sack(sk); } + newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); @@ -3590,7 +3593,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_packets; int prior_sacked = tp->sacked_out; int pkts_acked = 0; - int newly_acked_sacked = 0; bool frto_cwnd = false; /* If the ack is older than previous acks @@ -3666,8 +3668,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); pkts_acked = prior_packets - tp->packets_out; - newly_acked_sacked = (prior_packets - prior_sacked) - - (tp->packets_out - tp->sacked_out); if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); @@ -3681,7 +3681,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, is_dupack, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) @@ -3698,7 +3698,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, is_dupack, flag); /* If this ack opens up a zero window, clear backoff. 
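[Editor's note] The tcp_fastretrans_alert() hunks above move the newly_acked_sacked computation inside the function, deriving it from prior_sacked at the two points where it is actually consumed. A worked example of the arithmetic, as a tiny standalone sketch:

#include <stdio.h>

/* Mirrors the recomputation moved into tcp_fastretrans_alert():
 * how many segments left the network on this ACK, counting both
 * cumulatively acked and newly SACKed ones. */
static int newly_acked_sacked(int pkts_acked, int sacked_out,
                              int prior_sacked)
{
        return pkts_acked + sacked_out - prior_sacked;
}

int main(void)
{
        /* 3 segments cumulatively acked while the SACK count went
         * from 5 to 4: one acked segment was already counted as
         * SACKed, so only 2 are newly acked-or-sacked. */
        printf("%d\n", newly_acked_sacked(3, 4, 5));
        return 0;
}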
It was * being used to time the probes, and is probably far higher than @@ -3718,8 +3718,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - newly_acked_sacked = tp->sacked_out - prior_sacked; - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, is_dupack, flag); } @@ -4351,19 +4350,20 @@ static void tcp_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk); static int tcp_prune_queue(struct sock *sk); -static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) +static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, + unsigned int size) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_rmem_schedule(sk, size)) { + !sk_rmem_schedule(sk, skb, size)) { if (tcp_prune_queue(sk) < 0) return -1; - if (!sk_rmem_schedule(sk, size)) { + if (!sk_rmem_schedule(sk, skb, size)) { if (!tcp_prune_ofo_queue(sk)) return -1; - if (!sk_rmem_schedule(sk, size)) + if (!sk_rmem_schedule(sk, skb, size)) return -1; } } @@ -4418,7 +4418,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) TCP_ECN_check_ce(tp, skb); - if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) { + if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); __kfree_skb(skb); return; @@ -4552,17 +4552,17 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; struct tcphdr *th; bool fragstolen; - if (tcp_try_rmem_schedule(sk, size + sizeof(*th))) - goto err; - skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); if (!skb) goto err; + if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) + goto err_free; + th = (struct tcphdr *)skb_put(skb, sizeof(*th)); skb_reset_transport_header(skb); memset(th, 0, sizeof(*th)); @@ -4633,7 +4633,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb->truesize)) + tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto drop; eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); @@ -4661,7 +4661,7 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); - else if (!sock_flag(sk, SOCK_DEAD)) + if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, 0); return; } @@ -5391,6 +5391,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); + if (unlikely(sk->sk_rx_dst == NULL)) + inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. 
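[Editor's note] The tcp_send_rcvq() hunk above swaps the order of allocation and memory accounting: the skb is allocated first, then tcp_try_rmem_schedule() charges it (the new skb argument lets sk_rmem_schedule() inspect the actual buffer), and the error path frees the skb it just allocated. A toy sketch of the allocate-then-charge discipline, with invented names and a fake accounting counter:

#include <stdio.h>
#include <stdlib.h>

static long rmem_used, rmem_limit = 4096;

/* Charge only succeeds under the limit; stands in for sk_rmem_schedule(). */
static int try_charge(size_t size)
{
        if (rmem_used + (long)size > rmem_limit)
                return -1;
        rmem_used += size;
        return 0;
}

static void *queue_data(size_t size)
{
        void *buf = malloc(size);       /* allocate first... */
        if (!buf)
                return NULL;
        if (try_charge(size)) {         /* ...then account for it */
                free(buf);              /* error path owns the buffer */
                return NULL;
        }
        return buf;
}

int main(void)
{
        void *b = queue_data(1024);

        printf("queued=%p used=%ld\n", b, rmem_used);
        free(b);
        return 0;
}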
* The code loosely follows the one in the famous @@ -5475,7 +5477,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len) { #ifdef CONFIG_NET_DMA - if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + if (tp->ucopy.task == current && + sock_owned_by_user(sk) && + tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { copied_early = 1; eaten = 1; } @@ -5552,8 +5556,7 @@ no_ack: #endif if (eaten) kfree_skb_partial(skb, fragstolen); - else - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk, 0); return 0; } } @@ -5602,7 +5605,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); if (skb != NULL) { - sk->sk_rx_dst = dst_clone(skb_dst(skb)); + icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3e30548ac32a..00a748d14062 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -417,10 +417,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ tp->mtu_info = info; - if (!sock_owned_by_user(sk)) + if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); - else - set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); + } else { + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) + sock_hold(sk); + } goto out; } @@ -1462,6 +1464,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; + inet_sk_rx_dst_set(newsk, skb); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); @@ -1617,21 +1620,16 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) #endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + struct dst_entry *dst = sk->sk_rx_dst; + sock_rps_save_rxhash(sk, skb); - if (sk->sk_rx_dst) { - struct dst_entry *dst = sk->sk_rx_dst; - if (dst->ops->check(dst, 0) == NULL) { + if (dst) { + if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || + dst->ops->check(dst, 0) == NULL) { dst_release(dst); sk->sk_rx_dst = NULL; } } - if (unlikely(sk->sk_rx_dst == NULL)) { - struct inet_sock *icsk = inet_sk(sk); - struct rtable *rt = skb_rtable(skb); - - sk->sk_rx_dst = dst_clone(&rt->dst); - icsk->rx_dst_ifindex = inet_iif(skb); - } if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; @@ -1686,7 +1684,6 @@ void tcp_v4_early_demux(struct sk_buff *skb) struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct tcphdr *th; - struct net_device *dev; struct sock *sk; if (skb->pkt_type != PACKET_HOST) @@ -1701,24 +1698,20 @@ void tcp_v4_early_demux(struct sk_buff *skb) if (th->doff < sizeof(struct tcphdr) / 4) return; - if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4)) - return; - - dev = skb->dev; sk = __inet_lookup_established(net, &tcp_hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), - dev->ifindex); + skb->skb_iif); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; if (sk->sk_state != TCP_TIME_WAIT) { struct dst_entry *dst = sk->sk_rx_dst; - struct inet_sock *icsk = inet_sk(sk); + if (dst) dst = dst_check(dst, 0); if (dst && - icsk->rx_dst_ifindex == dev->ifindex) + inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } @@ -1879,10 +1872,21 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; +void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = 
skb_dst(skb); + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; +} +EXPORT_SYMBOL(inet_sk_rx_dst_set); + const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .net_header_len = sizeof(struct iphdr), @@ -2640,7 +2644,7 @@ struct proto tcp_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM .init_cgroup = tcp_init_cgroup, .destroy_cgroup = tcp_destroy_cgroup, .proto_cgroup = tcp_proto_cgroup, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 2288a6399e1e..0abe67bb4d3a 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -731,6 +731,18 @@ static int __net_init tcp_net_metrics_init(struct net *net) static void __net_exit tcp_net_metrics_exit(struct net *net) { + unsigned int i; + + for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) { + struct tcp_metrics_block *tm, *next; + + tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1); + while (tm) { + next = rcu_dereference_protected(tm->tcpm_next, 1); + kfree(tm); + tm = next; + } + } kfree(net->ipv4.tcp_metrics_hash); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5912ac3fd240..6ff7f10dce9d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -387,8 +387,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - newsk->sk_rx_dst = dst_clone(skb_dst(skb)); - /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to * copy any s_data_payload stored at the original socket. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 33cd065cfbd8..d04632673a9e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -910,14 +910,18 @@ void tcp_release_cb(struct sock *sk) if (flags & (1UL << TCP_TSQ_DEFERRED)) tcp_tsq_handler(sk); - if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) + if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { tcp_write_timer_handler(sk); - - if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) + __sock_put(sk); + } + if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { tcp_delack_timer_handler(sk); - - if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) + __sock_put(sk); + } + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { sk->sk_prot->mtu_reduced(sk); + __sock_put(sk); + } } EXPORT_SYMBOL(tcp_release_cb); @@ -940,7 +944,7 @@ void __init tcp_tasklet_init(void) * We cant xmit new skbs from this context, as we might already * hold qdisc lock. */ -void tcp_wfree(struct sk_buff *skb) +static void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); @@ -1522,21 +1526,21 @@ static void tcp_cwnd_validate(struct sock *sk) * when we would be allowed to send the split-due-to-Nagle skb fully. 
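[Editor's note] The tcp_release_cb() hunk above pairs every deferred-work bit with a socket reference: whoever wins the test_and_set_bit() race takes a sock_hold(), and tcp_release_cb() drops it with __sock_put() after running the handler, so the socket cannot be freed while work is pending. The same pairing appears in tcp_v4_err(), tcp_v6_err() and the timer paths later in this series. A userspace sketch of the pattern using C11 atomics (all names invented):

#include <stdatomic.h>
#include <stdio.h>

struct obj {
        atomic_uint refcnt;
        atomic_uint deferred;   /* one bit per deferred job */
};

#define JOB_A (1u << 0)

static void hold(struct obj *o) { atomic_fetch_add(&o->refcnt, 1); }
static void put(struct obj *o)  { atomic_fetch_sub(&o->refcnt, 1); }

/* Defer a job: only the first setter takes the extra reference. */
static void defer_job(struct obj *o, unsigned bit)
{
        if (!(atomic_fetch_or(&o->deferred, bit) & bit))
                hold(o);        /* mirrors test_and_set_bit() + sock_hold() */
}

/* Run and clear deferred jobs, dropping one reference per job run. */
static void release_cb(struct obj *o)
{
        unsigned flags = atomic_exchange(&o->deferred, 0);

        if (flags & JOB_A) {
                printf("running deferred job A\n");
                put(o);         /* mirrors __sock_put() */
        }
}

int main(void)
{
        struct obj o = { 1, 0 };

        defer_job(&o, JOB_A);
        defer_job(&o, JOB_A);   /* second call must not re-hold */
        release_cb(&o);
        printf("refcnt=%u\n", atomic_load(&o.refcnt));  /* back to 1 */
        return 0;
}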
*/ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int cwnd) + unsigned int mss_now, unsigned int max_segs) { const struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, cwnd_len; + u32 needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; - cwnd_len = mss_now * cwnd; + max_len = mss_now * max_segs; - if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) - return cwnd_len; + if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) + return max_len; needed = min(skb->len, window); - if (cwnd_len <= needed) - return cwnd_len; + if (max_len <= needed) + return max_len; return needed - needed % mss_now; } @@ -1765,7 +1769,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= sk->sk_gso_max_size) + if (limit >= min_t(unsigned int, sk->sk_gso_max_size, + sk->sk_gso_max_segs * tp->mss_cache)) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ @@ -1999,7 +2004,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - cwnd_quota); + min_t(unsigned int, + cwnd_quota, + sk->sk_gso_max_segs)); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) @@ -2045,7 +2052,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, if (unlikely(sk->sk_state == TCP_CLOSE)) return; - if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) + if (tcp_write_xmit(sk, cur_mss, nonagle, 0, + sk_gfp_atomic(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -2666,7 +2674,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) s_data_desired = cvp->s_data_desired; - skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, + sk_gfp_atomic(sk, GFP_ATOMIC)); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -3064,7 +3073,7 @@ void tcp_send_ack(struct sock *sk) * tcp_transmit_skb() will set the ownership to this * sock. */ - buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); if (buff == NULL) { inet_csk_schedule_ack(sk); inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; @@ -3079,7 +3088,7 @@ void tcp_send_ack(struct sock *sk) /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); + tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } /* This routine sends a packet with an out of date sequence @@ -3099,7 +3108,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. 
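[Editor's note] Several hunks above clamp TSO sizing by a driver-supplied segment limit as well as the byte limit: tcp_write_xmit() passes min(cwnd_quota, sk_gso_max_segs) into tcp_mss_split_point(), and tcp_tso_should_defer() caps the "full-sized skb" test at gso_max_segs * mss. A simplified sketch of the split-point arithmetic (userspace names, and it drops the tail-skb special case):

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

/* How many bytes may this skb carry? Bounded by the send window,
 * by cwnd (in segments) and by the device's GSO segment limit. */
static unsigned int split_point(unsigned int skb_len, unsigned int window,
                                unsigned int mss, unsigned int cwnd_quota,
                                unsigned int gso_max_segs)
{
        unsigned int max_segs = min_u(cwnd_quota, gso_max_segs);
        unsigned int max_len = mss * max_segs;
        unsigned int needed;

        if (max_len <= window)
                return max_len;
        needed = min_u(skb_len, window);
        if (max_len <= needed)
                return max_len;
        return needed - needed % mss;   /* whole segments only */
}

int main(void)
{
        /* 64 KB skb, 10 segments of cwnd quota, device capped at 8 */
        printf("%u\n", split_point(65536, 40000, 1448, 10, 8));
        return 0;
}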
*/ - skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); if (skb == NULL) return -1; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6df36ad55a38..b774a03bd1dc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -252,7 +252,8 @@ static void tcp_delack_timer(unsigned long data) inet_csk(sk)->icsk_ack.blocked = 1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ - set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); @@ -481,7 +482,8 @@ static void tcp_write_timer(unsigned long data) tcp_write_timer_handler(sk); } else { /* deleguate our work to tcp_release_cb() */ - set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b4c3582a991f..2814f66dac64 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -758,7 +758,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) uh->check = CSUM_MANGLED_0; send: - err = ip_send_skb(skb); + err = ip_send_skb(sock_net(sk), skb); if (err) { if (err == -ENOBUFS && !inet->recverr) { UDP_INC_STATS_USER(sock_net(sk), @@ -1226,6 +1226,11 @@ try_again: if (unlikely(err)) { trace_kfree_skb(skb, udp_recvmsg); + if (!peeked) { + atomic_inc(&sk->sk_drops); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); + } goto out_free; } diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 58d23a572509..06814b6216dc 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) if (skb_dst(skb) == NULL) { const struct iphdr *iph = ip_hdr(skb); - if (ip_route_input(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) + if (ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) goto drop; } return dst_input(skb); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index c6281847f16a..681ea2f413e2 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -92,6 +92,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_type = rt->rt_type; xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; + INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); return 0; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 79181819a24f..6bc85f7c31e3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -494,8 +494,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf) struct net_device *dev; struct inet6_dev *idev; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { idev = __in6_dev_get(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); @@ -504,7 +503,6 @@ static void addrconf_forward_change(struct net *net, __s32 newf) dev_forward_change(idev); } } - rcu_read_unlock(); } static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 6dc7fd353ef5..282f3723ee19 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -167,8 +167,6 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct esp_data *esp = x->data; /* skb is pure payload to encrypt */ - err = -ENOMEM; - aead = 
esp->aead; alen = crypto_aead_authsize(aead); @@ -203,8 +201,10 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) } tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); - if (!tmp) + if (!tmp) { + err = -ENOMEM; goto error; + } seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 0251a6005be8..c4f934176cab 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -175,33 +175,12 @@ void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, const struct in6_addr *saddr) { __ip6_dst_store(sk, dst, daddr, saddr); - -#ifdef CONFIG_XFRM - { - struct rt6_info *rt = (struct rt6_info *)dst; - rt->rt6i_flow_cache_genid = atomic_read(&flow_cache_genid); - } -#endif } static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { - struct dst_entry *dst; - - dst = __sk_dst_check(sk, cookie); - -#ifdef CONFIG_XFRM - if (dst) { - struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_flow_cache_genid != atomic_read(&flow_cache_genid)) { - __sk_dst_reset(sk); - dst = NULL; - } - } -#endif - - return dst; + return __sk_dst_check(sk, cookie); } static struct dst_entry *inet6_csk_route_socket(struct sock *sk, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 13690d650c3e..286acfc21250 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -819,6 +819,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) offsetof(struct rt6_info, rt6i_src), allow_create, replace_required); + if (IS_ERR(sn)) { + err = PTR_ERR(sn); + sn = NULL; + } if (!sn) { /* If it is failed, discard just allocated root, and then (in st_failure) stale node diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 5ab923e51af3..a52d864d562b 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -47,9 +47,16 @@ -inline int ip6_rcv_finish( struct sk_buff *skb) +int ip6_rcv_finish(struct sk_buff *skb) { - if (skb_dst(skb) == NULL) + if (sysctl_ip_early_demux && !skb_dst(skb)) { + const struct inet6_protocol *ipprot; + + ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); + if (ipprot && ipprot->early_demux) + ipprot->early_demux(skb); + } + if (!skb_dst(skb)) ip6_route_input(skb); return dst_input(skb); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index da2e92d05c15..745a32042950 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -307,10 +307,10 @@ static int __net_init ipv6_proc_init_net(struct net *net) goto proc_dev_snmp6_fail; return 0; +proc_dev_snmp6_fail: + proc_net_remove(net, "snmp6"); proc_snmp6_fail: proc_net_remove(net, "sockstat6"); -proc_dev_snmp6_fail: - proc_net_remove(net, "dev_snmp6"); return -ENOMEM; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index cf02cb97bbdd..854e4018d205 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -226,7 +226,7 @@ static struct rt6_info ip6_null_entry_template = { .dst = { .__refcnt = ATOMIC_INIT(1), .__use = 1, - .obsolete = -1, + .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -ENETUNREACH, .input = ip6_pkt_discard, .output = ip6_pkt_discard_out, @@ -246,7 +246,7 @@ static struct rt6_info ip6_prohibit_entry_template = { .dst = { .__refcnt = ATOMIC_INIT(1), .__use = 1, - .obsolete = -1, + .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EACCES, .input = ip6_pkt_prohibit, .output = ip6_pkt_prohibit_out, @@ -261,7 +261,7 @@ static struct rt6_info ip6_blk_hole_entry_template = { .dst = { .__refcnt = ATOMIC_INIT(1), .__use = 1, - 
.obsolete = -1, + .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, .output = dst_discard, @@ -281,13 +281,14 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, struct fib6_table *table) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, - 0, DST_OBSOLETE_NONE, flags); + 0, DST_OBSOLETE_FORCE_CHK, flags); if (rt) { struct dst_entry *dst = &rt->dst; memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); + rt->rt6i_genid = rt_genid(net); } return rt; } @@ -1031,6 +1032,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt = (struct rt6_info *) dst; + /* All IPV6 dsts are created with ->obsolete set to the value + * DST_OBSOLETE_FORCE_CHK which forces validation calls down + * into this function always. + */ + if (rt->rt6i_genid != rt_genid(dev_net(rt->dst.dev))) + return NULL; + if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) { if (rt->rt6i_peer_genid != rt6_peer_genid()) { if (!rt6_has_peer(rt)) @@ -1397,8 +1405,6 @@ int ip6_route_add(struct fib6_config *cfg) goto out; } - rt->dst.obsolete = -1; - if (cfg->fc_flags & RTF_EXPIRES) rt6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); @@ -2080,7 +2086,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_idev = idev; - rt->dst.obsolete = -1; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) @@ -2480,12 +2485,8 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) goto nla_put_failure; - if (!(rt->rt6i_flags & RTF_EXPIRES)) - expires = 0; - else if (rt->dst.expires - jiffies < INT_MAX) - expires = rt->dst.expires - jiffies; - else - expires = INT_MAX; + + expires = (rt->rt6i_flags & RTF_EXPIRES) ? 
rt->dst.expires - jiffies : 0; if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto nla_put_failure; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f49476e2d884..acd32e3f1b68 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -94,6 +94,18 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, } #endif +static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct rt6_info *rt = (const struct rt6_info *)dst; + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (rt->rt6i_node) + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; +} + static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { @@ -391,8 +403,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, tp->mtu_info = ntohl(info); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); - else - set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); + else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, + &tp->tsq_flags)) + sock_hold(sk); goto out; } @@ -1270,6 +1283,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_gso_type = SKB_GSO_TCPV6; __ip6_dst_store(newsk, dst, NULL, NULL); + inet6_sk_rx_dst_set(newsk, skb); newtcp6sk = (struct tcp6_sock *)newsk; inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; @@ -1299,7 +1313,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, /* Clone pktoptions received with SYN */ newnp->pktoptions = NULL; if (treq->pktopts != NULL) { - newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC); + newnp->pktoptions = skb_clone(treq->pktopts, + sk_gfp_atomic(sk, GFP_ATOMIC)); consume_skb(treq->pktopts); treq->pktopts = NULL; if (newnp->pktoptions) @@ -1349,7 +1364,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, * across. Shucks. 
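[Editor's note] This series leans on generation counters instead of walking caches on flush: IPv4 routes snapshot the per-namespace atomic rt_genid at allocation and rt_cache_flush() just bumps it, while IPv6 dsts are now created DST_OBSOLETE_FORCE_CHK so ip6_dst_check() always runs and can compare rt6i_genid (plus the fn_sernum cookie used for the cached sk_rx_dst). A toy version of the idea, with invented names:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint genid;       /* per-namespace counter in the kernel */

struct route {
        unsigned int gen;       /* snapshot taken at allocation */
        int data;
};

static void route_init(struct route *rt, int data)
{
        rt->gen = atomic_load(&genid);
        rt->data = data;
}

/* O(1) flush: every cached route becomes stale at once. */
static void cache_flush(void)
{
        atomic_fetch_add(&genid, 1);
}

static struct route *route_check(struct route *rt)
{
        return rt->gen == atomic_load(&genid) ? rt : NULL;
}

int main(void)
{
        struct route rt;

        route_init(&rt, 42);
        printf("before flush: %s\n", route_check(&rt) ? "valid" : "stale");
        cache_flush();
        printf("after flush:  %s\n", route_check(&rt) ? "valid" : "stale");
        return 0;
}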
*/ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newnp->daddr, - AF_INET6, key->key, key->keylen, GFP_ATOMIC); + AF_INET6, key->key, key->keylen, + sk_gfp_atomic(sk, GFP_ATOMIC)); } #endif @@ -1442,10 +1458,20 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, GFP_ATOMIC); + opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC)); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + struct dst_entry *dst = sk->sk_rx_dst; + sock_rps_save_rxhash(sk, skb); + if (dst) { + if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || + dst->ops->check(dst, np->rx_dst_cookie) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) goto reset; if (opt_skb) @@ -1674,6 +1700,43 @@ do_time_wait: goto discard_it; } +static void tcp_v6_early_demux(struct sk_buff *skb) +{ + const struct ipv6hdr *hdr; + const struct tcphdr *th; + struct sock *sk; + + if (skb->pkt_type != PACKET_HOST) + return; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) + return; + + hdr = ipv6_hdr(skb); + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr) / 4) + return; + + sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo, + &hdr->saddr, th->source, + &hdr->daddr, ntohs(th->dest), + inet6_iif(skb)); + if (sk) { + skb->sk = sk; + skb->destructor = sock_edemux; + if (sk->sk_state != TCP_TIME_WAIT) { + struct dst_entry *dst = sk->sk_rx_dst; + struct inet_sock *icsk = inet_sk(sk); + if (dst) + dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); + if (dst && + icsk->rx_dst_ifindex == skb->skb_iif) + skb_dst_set_noref(skb, dst); + } + } +} + static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_unique = tcp_twsk_unique, @@ -1684,6 +1747,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, + .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), @@ -1715,6 +1779,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct iphdr), @@ -1978,12 +2043,13 @@ struct proto tcpv6_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM .proto_cgroup = tcp_proto_cgroup, #endif }; static const struct inet6_protocol tcpv6_protocol = { + .early_demux = tcp_v6_early_demux, .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .gso_send_check = tcp_v6_gso_send_check, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 99d0077b56b8..07e2bfef6845 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -394,6 +394,17 @@ try_again: } if (unlikely(err)) { trace_kfree_skb(skb, udpv6_recvmsg); + if (!peeked) { + atomic_inc(&sk->sk_drops); + if (is_udp4) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, + is_udplite); + else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, + is_udplite); + } goto out_free; } if (!peeked) { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ef39812107b1..f8c4c08ffb60 
100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -73,6 +73,13 @@ static int xfrm6_get_tos(const struct flowi *fl) return 0; } +static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst) +{ + struct rt6_info *rt = (struct rt6_info *)xdst; + + rt6_init_peer(rt, net->ipv6.peers); +} + static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { @@ -286,6 +293,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, + .init_dst = xfrm6_init_dst, .init_path = xfrm6_init_path, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 393355d37b47..1a9f3723c13c 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1347,11 +1347,10 @@ static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) /* Remove from tunnel list */ spin_lock_bh(&pn->l2tp_tunnel_list_lock); list_del_rcu(&tunnel->list); + kfree_rcu(tunnel, rcu); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - synchronize_rcu(); atomic_dec(&l2tp_tunnel_count); - kfree(tunnel); } /* Create a socket for the tunnel, if one isn't set up by @@ -1502,6 +1501,8 @@ out: return err; } +static struct lock_class_key l2tp_socket_class; + int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp) { struct l2tp_tunnel *tunnel = NULL; @@ -1606,6 +1607,8 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 tunnel->old_sk_destruct = sk->sk_destruct; sk->sk_destruct = &l2tp_tunnel_destruct; tunnel->sock = sk; + lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, "l2tp_sock"); + sk->sk_allocation = GFP_ATOMIC; /* Add tunnel to our list */ diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a38ec6cdeee1..56d583e083a7 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -163,6 +163,7 @@ struct l2tp_tunnel_cfg { struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ + struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ struct hlist_head session_hlist[L2TP_HASH_SIZE]; /* hashed list of sessions, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index f9ee74deeac2..3bfb34aaee29 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -153,7 +153,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length); } - if (!pskb_may_pull(skb, sizeof(ETH_HLEN))) + if (!pskb_may_pull(skb, ETH_HLEN)) goto error; secpath_reset(skb); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 35e1e4bde587..927547171bc7 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -410,6 +410,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, lsa->l2tp_family = AF_INET6; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; + lsa->l2tp_unused = 0; if (peer) { if (!lsk->peer_conn_id) return -ENOTCONN; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index f6fe4d400502..c2190005a114 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -969,14 +969,13 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_llc sllc; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); - int rc = 0; + int rc = -EBADF; memset(&sllc, 0, sizeof(sllc)); lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) goto out; *uaddrlen 
= sizeof(sllc); - memset(uaddr, 0, *uaddrlen); if (peer) { rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) @@ -1206,7 +1205,7 @@ static int __init llc2_init(void) rc = llc_proc_init(); if (rc != 0) { printk(llc_proc_err_msg); - goto out_unregister_llc_proto; + goto out_station; } rc = llc_sysctl_init(); if (rc) { @@ -1226,7 +1225,8 @@ out_sysctl: llc_sysctl_exit(); out_proc: llc_proc_exit(); -out_unregister_llc_proto: +out_station: + llc_station_exit(); proto_unregister(&llc_proto); goto out; } diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index e32cab44ea95..dd3e83328ad5 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -42,6 +42,7 @@ static void (*llc_type_handlers[2])(struct llc_sap *sap, void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)) { + smp_wmb(); /* ensure initialisation is complete before it's called */ if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = handler; } @@ -50,11 +51,19 @@ void llc_remove_pack(int type) { if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = NULL; + synchronize_net(); } void llc_set_station_handler(void (*handler)(struct sk_buff *skb)) { + /* Ensure initialisation is complete before it's called */ + if (handler) + smp_wmb(); + llc_station_handler = handler; + + if (!handler) + synchronize_net(); } /** @@ -150,6 +159,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, int dest; int (*rcv)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); + void (*sta_handler)(struct sk_buff *skb); + void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb); if (!net_eq(dev_net(dev), &init_net)) goto drop; @@ -182,7 +193,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); - if (unlikely(!dest || !llc_type_handlers[dest - 1])) { + sap_handler = dest ? 
ACCESS_ONCE(llc_type_handlers[dest - 1]) : NULL; + if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); else @@ -193,7 +205,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, if (cskb) rcv(cskb, dev, pt, orig_dev); } - llc_type_handlers[dest - 1](sap, skb); + sap_handler(sap, skb); } llc_sap_put(sap); out: @@ -202,9 +214,10 @@ drop: kfree_skb(skb); goto out; handle_station: - if (!llc_station_handler) + sta_handler = ACCESS_ONCE(llc_station_handler); + if (!sta_handler) goto drop; - llc_station_handler(skb); + sta_handler(skb); goto out; } diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 39a8d8924b9c..b2f2bac2c2a2 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -268,7 +268,7 @@ static int llc_station_ac_send_null_dsap_xid_c(struct sk_buff *skb) out: return rc; free: - kfree_skb(skb); + kfree_skb(nskb); goto out; } @@ -293,7 +293,7 @@ static int llc_station_ac_send_xid_r(struct sk_buff *skb) out: return rc; free: - kfree_skb(skb); + kfree_skb(nskb); goto out; } @@ -322,7 +322,7 @@ static int llc_station_ac_send_test_r(struct sk_buff *skb) out: return rc; free: - kfree_skb(skb); + kfree_skb(nskb); goto out; } @@ -687,12 +687,8 @@ static void llc_station_rcv(struct sk_buff *skb) llc_station_state_process(skb); } -int __init llc_station_init(void) +void __init llc_station_init(void) { - int rc = -ENOBUFS; - struct sk_buff *skb; - struct llc_station_state_ev *ev; - skb_queue_head_init(&llc_main_station.mac_pdu_q); skb_queue_head_init(&llc_main_station.ev_q.list); spin_lock_init(&llc_main_station.ev_q.lock); @@ -700,23 +696,12 @@ int __init llc_station_init(void) (unsigned long)&llc_main_station); llc_main_station.ack_timer.expires = jiffies + sysctl_llc_station_ack_timeout; - skb = alloc_skb(0, GFP_ATOMIC); - if (!skb) - goto out; - rc = 0; - llc_set_station_handler(llc_station_rcv); - ev = llc_station_ev(skb); - memset(ev, 0, sizeof(*ev)); llc_main_station.maximum_retry = 1; - llc_main_station.state = LLC_STATION_STATE_DOWN; - ev->type = LLC_STATION_EV_TYPE_SIMPLE; - ev->prim_type = LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK; - rc = llc_station_next_state(skb); -out: - return rc; + llc_main_station.state = LLC_STATION_STATE_UP; + llc_set_station_handler(llc_station_rcv); } -void __exit llc_station_exit(void) +void llc_station_exit(void) { llc_set_station_handler(NULL); } diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d41974aacf51..a58c0b649ba1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1378,6 +1378,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, else memset(next_hop, 0, ETH_ALEN); + memset(pinfo, 0, sizeof(*pinfo)); + pinfo->generation = mesh_paths_generation; pinfo->filled = MPATH_INFO_FRAME_QLEN | @@ -1396,7 +1398,6 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, pinfo->discovery_timeout = jiffies_to_msecs(mpath->discovery_timeout); pinfo->discovery_retries = mpath->discovery_retries; - pinfo->flags = 0; if (mpath->flags & MESH_PATH_ACTIVE) pinfo->flags |= NL80211_MPATH_FLAG_ACTIVE; if (mpath->flags & MESH_PATH_RESOLVING) @@ -1405,10 +1406,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, pinfo->flags |= NL80211_MPATH_FLAG_SN_VALID; if (mpath->flags & MESH_PATH_FIXED) pinfo->flags |= NL80211_MPATH_FLAG_FIXED; - if (mpath->flags & MESH_PATH_RESOLVING) - pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING; - - pinfo->flags = mpath->flags; + if (mpath->flags & MESH_PATH_RESOLVED) + pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED; } static int 
ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev, diff --git a/net/mac80211/led.c b/net/mac80211/led.c index 1bf7903496f8..bcffa6903129 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -276,7 +276,7 @@ static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) read_lock(&tpt_trig->trig.leddev_list_lock); list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list) - led_brightness_set(led_cdev, LED_OFF); + led_set_brightness(led_cdev, LED_OFF); read_unlock(&tpt_trig->trig.leddev_list_lock); } diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 6fac18c0423f..85572353a7e3 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -622,6 +622,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) del_timer_sync(&sdata->u.mesh.housekeeping_timer); del_timer_sync(&sdata->u.mesh.mesh_path_root_timer); + del_timer_sync(&sdata->u.mesh.mesh_path_timer); /* * If the timer fired while we waited for it, it will have * requeued the work. Now the work will be running again @@ -634,6 +635,8 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) local->fif_other_bss--; atomic_dec(&local->iff_allmultis); ieee80211_configure_filter(local); + + sdata->u.mesh.timers_running = 0; } static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index cef0c9e79aba..f76b83341cf9 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1430,6 +1430,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, del_timer_sync(&sdata->u.mgd.bcn_mon_timer); del_timer_sync(&sdata->u.mgd.timer); del_timer_sync(&sdata->u.mgd.chswitch_timer); + + sdata->u.mgd.timers_running = 0; } void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, @@ -3246,6 +3248,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, goto out_unlock; err_clear: + memset(ifmgd->bssid, 0, ETH_ALEN); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); ifmgd->auth_data = NULL; err_free: kfree(auth_data); @@ -3437,6 +3441,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, err = 0; goto out; err_clear: + memset(ifmgd->bssid, 0, ETH_ALEN); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); ifmgd->assoc_data = NULL; err_free: kfree(assoc_data); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index bcaee5d12839..839dd9737989 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -299,7 +299,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted, if (local->scan_req != local->int_scan_req) cfg80211_scan_done(local->scan_req, aborted); local->scan_req = NULL; - local->scan_sdata = NULL; + rcu_assign_pointer(local->scan_sdata, NULL); local->scanning = 0; local->scan_channel = NULL; @@ -984,7 +984,6 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata) kfree(local->sched_scan_ies.ie[i]); drv_sched_scan_stop(local, sdata); - rcu_assign_pointer(local->sched_scan_sdata, NULL); } out: mutex_unlock(&local->mtx); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index acf712ffb5e6..c5e8c9c31f76 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1811,37 +1811,31 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr, sdata, NULL, NULL); } else { - int is_mesh_mcast = 1; - const u8 *mesh_da; + /* DS -> MBSS (802.11-2012 13.11.3.3). 
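[Editor's note] A recurring fix in this section is zeroing a caller-visible structure before selectively filling it: mpath_set_pinfo() above now memset()s the whole mpath_info and rebuilds pinfo->flags bit by bit (the old code leaked raw internal mpath->flags), and the same idea protects l2tp_ip6_getname() above and the ip_vs timeout copy below from leaking stale stack bytes to userspace. A sketch with made-up flag names:

#include <stdio.h>
#include <string.h>

#define F_ACTIVE        (1u << 0)
#define F_RESOLVED      (1u << 1)

struct path_info {
        unsigned int flags;
        unsigned int metric;
        char pad[32];   /* fields the caller copies out verbatim */
};

static void fill_info(struct path_info *pinfo, unsigned int internal_flags,
                      unsigned int metric)
{
        /* Zero everything first: no stale stack data, and no internal
         * flag bits escaping through fields we forget to set. */
        memset(pinfo, 0, sizeof(*pinfo));

        pinfo->metric = metric;
        if (internal_flags & 0x10)      /* internal ACTIVE bit */
                pinfo->flags |= F_ACTIVE;
        if (internal_flags & 0x40)      /* internal RESOLVED bit */
                pinfo->flags |= F_RESOLVED;
}

int main(void)
{
        struct path_info info;

        fill_info(&info, 0x50, 7);
        printf("flags=%#x metric=%u\n", info.flags, info.metric);
        return 0;
}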
+ * For unicast with unknown forwarding information, + * destination might be in the MBSS or if that fails + * forwarded to another mesh gate. In either case + * resolution will be handled in ieee80211_xmit(), so + * leave the original DA. This also works for mcast */ + const u8 *mesh_da = skb->data; + + if (mppath) + mesh_da = mppath->mpp; + else if (mpath) + mesh_da = mpath->dst; + rcu_read_unlock(); - if (is_multicast_ether_addr(skb->data)) - /* DA TA mSA AE:SA */ - mesh_da = skb->data; - else { - static const u8 bcast[ETH_ALEN] = - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - if (mppath) { - /* RA TA mDA mSA AE:DA SA */ - mesh_da = mppath->mpp; - is_mesh_mcast = 0; - } else if (mpath) { - mesh_da = mpath->dst; - is_mesh_mcast = 0; - } else { - /* DA TA mSA AE:SA */ - mesh_da = bcast; - } - } hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, mesh_da, sdata->vif.addr); - rcu_read_unlock(); - if (is_mesh_mcast) + if (is_multicast_ether_addr(mesh_da)) + /* DA TA mSA AE:SA */ meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr, sdata, skb->data + ETH_ALEN, NULL); else + /* RA TA mDA mSA AE:DA SA */ meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr, sdata, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 84444dda194b..f51013c07b9f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1171,8 +1171,10 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, goto out_err; } svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!svc->stats.cpustats) + if (!svc->stats.cpustats) { + ret = -ENOMEM; goto out_err; + } /* I'm the first user of the service */ atomic_set(&svc->usecnt, 0); @@ -2759,6 +2761,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_timeout_user t; + memset(&t, 0, sizeof(t)); __ip_vs_get_timeouts(net, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index cf4875565d67..2ceec64b19f9 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -249,12 +249,15 @@ static void death_by_event(unsigned long ul_conntrack) { struct nf_conn *ct = (void *)ul_conntrack; struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + + BUG_ON(ecache == NULL); if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { /* bad luck, let's retry again */ - ct->timeout.expires = jiffies + + ecache->timeout.expires = jiffies + (random32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ct->timeout); + add_timer(&ecache->timeout); return; } /* we've got the event delivered, now it's dying */ @@ -268,6 +271,9 @@ static void death_by_event(unsigned long ul_conntrack) void nf_ct_insert_dying_list(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + + BUG_ON(ecache == NULL); /* add this conntrack to the dying list */ spin_lock_bh(&nf_conntrack_lock); @@ -275,10 +281,10 @@ void nf_ct_insert_dying_list(struct nf_conn *ct) &net->ct.dying); spin_unlock_bh(&nf_conntrack_lock); /* set a new timer to retry event delivery */ - setup_timer(&ct->timeout, death_by_event, (unsigned long)ct); - ct->timeout.expires = jiffies + + setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); + ecache->timeout.expires = jiffies + (random32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ct->timeout); + add_timer(&ecache->timeout); } 
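[Editor's note] The conntrack changes around here treat a successful del_timer() as taking ownership: the ecache hunk above gives the event-retry timer its own field so it cannot clash with the conntrack's normal timeout, and in the expectation hunk that follows, only the path that actually cancelled the pending timeout may unlink and drop the expectation, so a concurrently firing timer and the replacement path can never free the same object twice. A userspace analogue of the try-cancel ownership test, using an atomic "armed" flag (names invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct expect {
        atomic_bool timer_armed;
        int id;
};

/* Stands in for del_timer(): true only for the one caller that
 * actually disarmed a pending timer; that caller owns teardown. */
static bool try_cancel(struct expect *e)
{
        return atomic_exchange(&e->timer_armed, false);
}

static void replace(struct expect *e)   /* update side */
{
        if (try_cancel(e)) {
                printf("replacement path freed expect %d\n", e->id);
                free(e);
        }
}

int main(void)
{
        struct expect *e = malloc(sizeof(*e));

        atomic_init(&e->timer_armed, true);
        e->id = 1;
        replace(e);     /* wins the race and frees the object */
        /* A timer expiry path doing the same try_cancel() would now
         * see timer_armed == false and back off (in real code the
         * timer must of course not run after the free). */
        return 0;
}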
EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 45cf602a76bc..527651a53a45 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -361,23 +361,6 @@ static void evict_oldest_expect(struct nf_conn *master, } } -static inline int refresh_timer(struct nf_conntrack_expect *i) -{ - struct nf_conn_help *master_help = nfct_help(i->master); - const struct nf_conntrack_expect_policy *p; - - if (!del_timer(&i->timeout)) - return 0; - - p = &rcu_dereference_protected( - master_help->helper, - lockdep_is_held(&nf_conntrack_lock) - )->expect_policy[i->class]; - i->timeout.expires = jiffies + p->timeout * HZ; - add_timer(&i->timeout); - return 1; -} - static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) { const struct nf_conntrack_expect_policy *p; @@ -386,7 +369,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); - struct hlist_node *n; + struct hlist_node *n, *next; unsigned int h; int ret = 1; @@ -395,12 +378,12 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) goto out; } h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry_safe(i, n, next, &net->ct.expect_hash[h], hnode) { if (expect_matches(i, expect)) { - /* Refresh timer: if it's dying, ignore.. */ - if (refresh_timer(i)) { - ret = 0; - goto out; + if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_ct_expect_put(i); + break; } } else if (expect_clash(i, expect)) { ret = -EBUSY; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 14f67a2cbcb5..9807f3278fcb 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1896,10 +1896,15 @@ static int ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) { struct nlattr *cda[CTA_MAX+1]; + int ret; nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy); - return ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); + spin_lock_bh(&nf_conntrack_lock); + ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); + spin_unlock_bh(&nf_conntrack_lock); + + return ret; } static struct nfq_ct_hook ctnetlink_nfqueue_hook = { @@ -2785,7 +2790,8 @@ static int __init ctnetlink_init(void) goto err_unreg_subsys; } - if (register_pernet_subsys(&ctnetlink_net_ops)) { + ret = register_pernet_subsys(&ctnetlink_net_ops); + if (ret < 0) { pr_err("ctnetlink_init: cannot register pernet operations\n"); goto err_unreg_exp_subsys; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index a5ac11ebef33..e046b3756aab 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -158,21 +158,18 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sCL -> sSS */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ -/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, +/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR }, /* * sNO -> sIV Too late and no reason to do anything * sSS -> sIV Client can't send SYN and then SYN/ACK * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open - * sSR -> sIG - * sES -> sIG Error: SYNs in window outside the SYN_SENT state - * are errors. 
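[Editor's note] __nf_ct_expect_check() below switches to hlist_for_each_entry_safe() because it may now unlink the entry it is looking at; the _safe variant caches the next pointer before the loop body runs. The classic userspace equivalent, deleting matches from a singly-linked list while walking it via a pointer to the link rather than to the node:

#include <stdio.h>
#include <stdlib.h>

struct node {
        int key;
        struct node *next;
};

/* Delete every node matching key; safe because we advance through
 * the link being examined, never through a freed node. */
static void delete_matches(struct node **head, int key)
{
        struct node **pp = head;

        while (*pp) {
                struct node *n = *pp;

                if (n->key == key) {
                        *pp = n->next;  /* unlink before freeing */
                        free(n);
                } else {
                        pp = &n->next;
                }
        }
}

static struct node *push(struct node *head, int key)
{
        struct node *n = malloc(sizeof(*n));

        n->key = key;
        n->next = head;
        return n;
}

int main(void)
{
        struct node *head = NULL, *n;

        head = push(push(push(NULL, 1), 2), 1);
        delete_matches(&head, 1);
        for (n = head; n; n = n->next)
                printf("%d\n", n->key); /* prints only 2 */
        while (head) {
                n = head->next;
                free(head);
                head = n;
        }
        return 0;
}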
Receiver will reply with RST - * and close the connection. - * Or we are not in sync and hold a dead connection. - * sFW -> sIG - * sCW -> sIG - * sLA -> sIG - * sTW -> sIG - * sCL -> sIG + * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open + * sES -> sIV Invalid SYN/ACK packets sent by the client + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV + * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, @@ -633,15 +630,9 @@ static bool tcp_in_window(const struct nf_conn *ct, ack = sack = receiver->td_end; } - if (seq == end - && (!tcph->rst - || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT) /* - * Packets contains no data: we assume it is valid - * and check the ack value only. - * However RST segments are always validated by their - * SEQ number, except when seq == 0 (reset sent answering - * SYN. + * RST sent answering SYN. */ seq = end = sender->td_end; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 758a1bacc126..5c0a112aeee6 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -183,12 +183,12 @@ static int media_len(const struct nf_conn *ct, const char *dptr, return len + digits_len(ct, dptr, limit, shift); } -static int parse_addr(const struct nf_conn *ct, const char *cp, - const char **endp, union nf_inet_addr *addr, - const char *limit) +static int sip_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit, bool delim) { const char *end; - int ret = 0; + int ret; if (!ct) return 0; @@ -197,16 +197,28 @@ static int parse_addr(const struct nf_conn *ct, const char *cp, switch (nf_ct_l3num(ct)) { case AF_INET: ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + if (ret == 0) + return 0; break; case AF_INET6: + if (cp < limit && *cp == '[') + cp++; + else if (delim) + return 0; + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + if (ret == 0) + return 0; + + if (end < limit && *end == ']') + end++; + else if (delim) + return 0; break; default: BUG(); } - if (ret == 0 || end == cp) - return 0; if (endp) *endp = end; return 1; @@ -219,7 +231,7 @@ static int epaddr_len(const struct nf_conn *ct, const char *dptr, union nf_inet_addr addr; const char *aux = dptr; - if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { + if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) { pr_debug("ip: %s parse failed.!\n", dptr); return 0; } @@ -296,7 +308,7 @@ int ct_sip_parse_request(const struct nf_conn *ct, return 0; dptr += shift; - if (!parse_addr(ct, dptr, &end, addr, limit)) + if (!sip_parse_addr(ct, dptr, &end, addr, limit, true)) return -1; if (end < limit && *end == ':') { end++; @@ -550,7 +562,7 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr, if (ret == 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit)) + if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true)) return -1; if (*c == ':') { c++; @@ -599,7 +611,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, unsigned int dataoff, unsigned int datalen, const char *name, unsigned int *matchoff, unsigned int *matchlen, - union nf_inet_addr *addr) + union nf_inet_addr *addr, bool delim) { const char *limit = dptr + datalen; const char *start, *end; @@ -613,7 +625,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const 
char *dptr, return 0; start += strlen(name); - if (!parse_addr(ct, start, &end, addr, limit)) + if (!sip_parse_addr(ct, start, &end, addr, limit, delim)) return 0; *matchoff = start - dptr; *matchlen = end - start; @@ -675,6 +687,47 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, return 1; } +static int sdp_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit) +{ + const char *end; + int ret; + + memset(addr, 0, sizeof(*addr)); + switch (nf_ct_l3num(ct)) { + case AF_INET: + ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + break; + case AF_INET6: + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + break; + default: + BUG(); + } + + if (ret == 0) + return 0; + if (endp) + *endp = end; + return 1; +} + +/* skip ip address. returns its length. */ +static int sdp_addr_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + union nf_inet_addr addr; + const char *aux = dptr; + + if (!sdp_parse_addr(ct, dptr, &dptr, &addr, limit)) { + pr_debug("ip: %s parse failed.!\n", dptr); + return 0; + } + + return dptr - aux; +} + /* SDP header parsing: a SDP session description contains an ordered set of * headers, starting with a section containing general session parameters, * optionally followed by multiple media descriptions. @@ -686,10 +739,10 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, */ static const struct sip_header ct_sdp_hdrs[] = { [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), - [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", epaddr_len), - [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", epaddr_len), - [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", epaddr_len), - [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", epaddr_len), + [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", sdp_addr_len), + [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", sdp_addr_len), [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), }; @@ -775,8 +828,8 @@ static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr, if (ret <= 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, NULL, addr, - dptr + *matchoff + *matchlen)) + if (!sdp_parse_addr(ct, dptr + *matchoff, NULL, addr, + dptr + *matchoff + *matchlen)) return -1; return 1; } @@ -1515,7 +1568,6 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, } static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; -static char sip_names[MAX_PORTS][4][sizeof("sip-65535")] __read_mostly; static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = { [SIP_EXPECT_SIGNALLING] = { @@ -1585,9 +1637,9 @@ static int __init nf_conntrack_sip_init(void) sip[i][j].me = THIS_MODULE; if (ports[i] == SIP_PORT) - sprintf(sip_names[i][j], "sip"); + sprintf(sip[i][j].name, "sip"); else - sprintf(sip_names[i][j], "sip-%u", i); + sprintf(sip[i][j].name, "sip-%u", i); pr_debug("port #%u: %u\n", i, ports[i]); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 169ab59ed9d4..5cfb5bedb2b8 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -381,6 +381,7 @@ __build_packet_message(struct nfulnl_instance *inst, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; sk_buff_data_t old_tail = inst->skb->tail; + struct sock *sk; nlh = nlmsg_put(inst->skb, 0, 0, 
NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, @@ -480,7 +481,7 @@ __build_packet_message(struct nfulnl_instance *inst, } if (indev && skb_mac_header_was_set(skb)) { - if (nla_put_be32(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) || + if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) || nla_put_be16(inst->skb, NFULA_HWLEN, htons(skb->dev->hard_header_len)) || nla_put(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len, @@ -499,18 +500,19 @@ __build_packet_message(struct nfulnl_instance *inst, } /* UID */ - if (skb->sk) { - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket && skb->sk->sk_socket->file) { - struct file *file = skb->sk->sk_socket->file; + sk = skb->sk; + if (sk && sk->sk_state != TCP_TIME_WAIT) { + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + struct file *file = sk->sk_socket->file; __be32 uid = htonl(file->f_cred->fsuid); __be32 gid = htonl(file->f_cred->fsgid); - read_unlock_bh(&skb->sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); if (nla_put_be32(inst->skb, NFULA_UID, uid) || nla_put_be32(inst->skb, NFULA_GID, gid)) goto nla_put_failure; } else - read_unlock_bh(&skb->sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } /* local sequence number */ @@ -996,8 +998,10 @@ static int __init nfnetlink_log_init(void) #ifdef CONFIG_PROC_FS if (!proc_create("nfnetlink_log", 0440, - proc_net_netfilter, &nful_file_ops)) + proc_net_netfilter, &nful_file_ops)) { + status = -ENOMEM; goto cleanup_logger; + } #endif return status; diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index ff5f75fddb15..91e9af4d1f42 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -145,6 +145,19 @@ static int dump_tcp_header(struct sbuff *m, const struct sk_buff *skb, return 0; } +static void dump_sk_uid_gid(struct sbuff *m, struct sock *sk) +{ + if (!sk || sk->sk_state == TCP_TIME_WAIT) + return; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) + sb_add(m, "UID=%u GID=%u ", + sk->sk_socket->file->f_cred->fsuid, + sk->sk_socket->file->f_cred->fsgid); + read_unlock_bh(&sk->sk_callback_lock); +} + /* One level of recursion won't kill us */ static void dump_ipv4_packet(struct sbuff *m, const struct nf_loginfo *info, @@ -361,14 +374,8 @@ static void dump_ipv4_packet(struct sbuff *m, } /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && !iphoff && skb->sk) { - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket && skb->sk->sk_socket->file) - sb_add(m, "UID=%u GID=%u ", - skb->sk->sk_socket->file->f_cred->fsuid, - skb->sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&skb->sk->sk_callback_lock); - } + if ((logflags & XT_LOG_UID) && !iphoff) + dump_sk_uid_gid(m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) @@ -436,8 +443,8 @@ log_packet_common(struct sbuff *m, const struct nf_loginfo *loginfo, const char *prefix) { - sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, - prefix, + sb_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", + '0' + loginfo->u.log.level, prefix, in ? in->name : "", out ? 
out->name : ""); #ifdef CONFIG_BRIDGE_NETFILTER @@ -717,14 +724,8 @@ static void dump_ipv6_packet(struct sbuff *m, } /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && recurse && skb->sk) { - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket && skb->sk->sk_socket->file) - sb_add(m, "UID=%u GID=%u ", - skb->sk->sk_socket->file->f_cred->fsuid, - skb->sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&skb->sk->sk_callback_lock); - } + if ((logflags & XT_LOG_UID) && recurse) + dump_sk_uid_gid(m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!recurse && skb->mark) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5463969da45b..527023823b5c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1362,7 +1362,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &scm; - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, true); if (err < 0) return err; @@ -1373,7 +1373,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, dst_pid = addr->nl_pid; dst_group = ffs(addr->nl_groups); err = -EPERM; - if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) + if ((dst_group || dst_pid) && + !netlink_capable(sock, NL_NONROOT_SEND)) goto out; } else { dst_pid = nlk->dst_pid; @@ -2147,6 +2148,7 @@ static void __init netlink_add_usersock_entry(void) rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; + nl_table[NETLINK_USERSOCK].nl_nonroot = NL_NONROOT_SEND; netlink_table_ungrab(); } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 06592d8b4a2b..7261eb81974f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -601,7 +601,7 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!capable(CAP_NET_BIND_SERVICE)) { dev_put(dev); release_sock(sk); - return -EACCES; + return -EPERM; } nr->user_addr = addr->fsa_digipeater[0]; nr->source_addr = addr->fsa_ax25.sax25_call; @@ -1169,7 +1169,12 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (er < 0) { + skb_free_datagram(sk, skb); + release_sock(sk); + return er; + } if (sax != NULL) { sax->sax25_family = AF_NETROM; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 320fa0e6951a..954405ceae9e 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -45,7 +45,7 @@ static int make_writable(struct sk_buff *skb, int write_len) return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } -/* remove VLAN header from packet and update csum accrodingly. */ +/* remove VLAN header from packet and update csum accordingly. 
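The af_netrom hunk above stops discarding the return value of skb_copy_datagram_iovec(); a failed copy now frees the datagram and propagates the error instead of delivering garbage. A minimal sketch of that pattern (the wrapper is illustrative, not from the patch):

#include <linux/skbuff.h>
#include <net/sock.h>

/* illustrative wrapper: copy a datagram to the user's iovec, cleaning
 * up the skb and the socket lock if the copy fails */
static int example_copy_or_drop(struct sock *sk, struct sk_buff *skb,
				struct msghdr *msg, size_t copied)
{
	int err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);

	if (err < 0) {
		skb_free_datagram(sk, skb);	/* drop the datagram */
		release_sock(sk);		/* undo lock_sock() */
		return err;
	}
	return 0;
}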
*/ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) { struct vlan_hdr *vhdr; @@ -325,9 +325,6 @@ static int sample(struct datapath *dp, struct sk_buff *skb, } } - if (!acts_list) - return 0; - return do_execute_actions(dp, skb, nla_data(acts_list), nla_len(acts_list), true); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d8277d29e710..cf58cedad083 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -425,10 +425,10 @@ static int validate_sample(const struct nlattr *attr, static int validate_tp_port(const struct sw_flow_key *flow_key) { if (flow_key->eth.type == htons(ETH_P_IP)) { - if (flow_key->ipv4.tp.src && flow_key->ipv4.tp.dst) + if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst) return 0; } else if (flow_key->eth.type == htons(ETH_P_IPV6)) { - if (flow_key->ipv6.tp.src && flow_key->ipv6.tp.dst) + if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst) return 0; } @@ -460,7 +460,7 @@ static int validate_set(const struct nlattr *a, if (flow_key->eth.type != htons(ETH_P_IP)) return -EINVAL; - if (!flow_key->ipv4.addr.src || !flow_key->ipv4.addr.dst) + if (!flow_key->ip.proto) return -EINVAL; ipv4_key = nla_data(ovs_key); diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 9b75617ca4e0..c30df1a10c67 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -145,15 +145,17 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies); * OVS_KEY_ATTR_PRIORITY 4 -- 4 8 * OVS_KEY_ATTR_IN_PORT 4 -- 4 8 * OVS_KEY_ATTR_ETHERNET 12 -- 4 16 + * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 (outer VLAN ethertype) * OVS_KEY_ATTR_8021Q 4 -- 4 8 - * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 + * OVS_KEY_ATTR_ENCAP 0 -- 4 4 (VLAN encapsulation) + * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 (inner VLAN ethertype) * OVS_KEY_ATTR_IPV6 40 -- 4 44 * OVS_KEY_ATTR_ICMPV6 2 2 4 8 * OVS_KEY_ATTR_ND 28 -- 4 32 * ------------------------------------------------- - * total 132 + * total 144 */ -#define FLOW_BUFSIZE 132 +#define FLOW_BUFSIZE 144 int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *); int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ceaca7c134a0..c5c9e2a54218 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1079,7 +1079,7 @@ static void *packet_current_rx_frame(struct packet_sock *po, default: WARN(1, "TPACKET version not supported\n"); BUG(); - return 0; + return NULL; } } @@ -1273,6 +1273,14 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) spin_unlock(&f->lock); } +static bool match_fanout_group(struct packet_type *ptype, struct sock * sk) +{ + if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) + return true; + + return false; +} + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_sock *po = pkt_sk(sk); @@ -1325,6 +1333,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; + match->prot_hook.id_match = match_fanout_group; dev_add_pack(&match->prot_hook); list_add(&match->list, &fanout_list); } @@ -1936,7 +1945,6 @@ static void tpacket_destruct_skb(struct sk_buff *skb) if (likely(po->tx_ring.pg_vec)) { ph = skb_shinfo(skb)->destructor_arg; - BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING); BUG_ON(atomic_read(&po->tx_ring.pending) == 0); atomic_dec(&po->tx_ring.pending); __packet_set_status(po, ph, 
TP_STATUS_AVAILABLE); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index f10fb8256442..05d60859d8e3 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -67,6 +67,9 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, struct tcf_common *pc; int ret = 0; int err; +#ifdef CONFIG_GACT_PROB + struct tc_gact_p *p_parm = NULL; +#endif if (nla == NULL) return -EINVAL; @@ -82,6 +85,12 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, #ifndef CONFIG_GACT_PROB if (tb[TCA_GACT_PROB] != NULL) return -EOPNOTSUPP; +#else + if (tb[TCA_GACT_PROB]) { + p_parm = nla_data(tb[TCA_GACT_PROB]); + if (p_parm->ptype >= MAX_RAND) + return -EINVAL; + } #endif pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info); @@ -103,8 +112,7 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, spin_lock_bh(&gact->tcf_lock); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB - if (tb[TCA_GACT_PROB] != NULL) { - struct tc_gact_p *p_parm = nla_data(tb[TCA_GACT_PROB]); + if (p_parm) { gact->tcfg_paction = p_parm->paction; gact->tcfg_pval = p_parm->pval; gact->tcfg_ptype = p_parm->ptype; @@ -133,7 +141,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, spin_lock(&gact->tcf_lock); #ifdef CONFIG_GACT_PROB - if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL) + if (gact->tcfg_ptype) action = gact_rand[gact->tcfg_ptype](gact); else action = gact->tcf_action; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 60e281ad0f07..58fb3c7aab9e 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -185,7 +185,12 @@ err3: err2: kfree(tname); err1: - kfree(pc); + if (ret == ACT_P_CREATED) { + if (est) + gen_kill_estimator(&pc->tcfc_bstats, + &pc->tcfc_rate_est); + kfree_rcu(pc, tcfc_rcu); + } return err; } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index fe81cc18e9e0..9c0fd0c78814 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -200,13 +200,12 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, out: if (err) { m->tcf_qstats.overlimits++; - /* should we be asking for packet to be dropped? 
- * may make sense for redirect case only - */ - retval = TC_ACT_SHOT; - } else { + if (m->tcfm_eaction != TCA_EGRESS_MIRROR) + retval = TC_ACT_SHOT; + else + retval = m->tcf_action; + } else retval = m->tcf_action; - } spin_unlock(&m->tcf_lock); return retval; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 26aa2f6ce257..45c53ab067a6 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -74,7 +74,10 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est, p = to_pedit(pc); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { - kfree(pc); + if (est) + gen_kill_estimator(&pc->tcfc_bstats, + &pc->tcfc_rate_est); + kfree_rcu(pc, tcfc_rcu); return -ENOMEM; } ret = ACT_P_CREATED; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 3922f2a2821b..3714f60f0b3c 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -131,7 +131,10 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, d = to_defact(pc); ret = alloc_defdata(d, defdata); if (ret < 0) { - kfree(pc); + if (est) + gen_kill_estimator(&pc->tcfc_bstats, + &pc->tcfc_rate_est); + kfree_rcu(pc, tcfc_rcu); return ret; } d->tcf_action = parm->action; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 6aabd77d1cfd..564b9fc8efd3 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -250,10 +250,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) cl = defmap[TC_PRIO_BESTEFFORT]; - if (cl == NULL || cl->level >= head->level) + if (cl == NULL) goto fallback; } - + if (cl->level >= head->level) + goto fallback; #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 9fc1c62ec80e..4e606fcb2534 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -191,7 +191,6 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (list_empty(&flow->flowchain)) { list_add_tail(&flow->flowchain, &q->new_flows); - codel_vars_init(&flow->cvars); q->new_flow_count++; flow->deficit = q->quantum; flow->dropped = 0; @@ -418,6 +417,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) struct fq_codel_flow *flow = q->flows + i; INIT_LIST_HEAD(&flow->flowchain); + codel_vars_init(&flow->cvars); } } if (sch->limit >= 1) diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index e901583e4ea5..d42234c0f13b 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -102,9 +102,8 @@ static inline int gred_wred_mode_check(struct Qdisc *sch) if (q == NULL) continue; - for (n = 0; n < table->DPs; n++) - if (table->tab[n] && table->tab[n] != q && - table->tab[n]->prio == q->prio) + for (n = i + 1; n < table->DPs; n++) + if (table->tab[n] && table->tab[n]->prio == q->prio) return 1; } @@ -137,6 +136,7 @@ static inline void gred_store_wred_set(struct gred_sched *table, struct gred_sched_data *q) { table->wred_set.qavg = q->vars.qavg; + table->wred_set.qidlestart = q->vars.qidlestart; } static inline int gred_use_ecn(struct gred_sched *t) @@ -176,7 +176,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; } - /* sum up all the qaves of prios <= to ours to get the new qave */ + /* sum up all the qaves of prios < ours to get the new qave */ if (!gred_wred_mode(t) && gred_rio_mode(t)) { int i; @@ -260,16 +260,18 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch) } else { q->backlog -= qdisc_pkt_len(skb); 
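The gred_wred_mode_check() change above starts the inner scan at i + 1: every unordered pair of virtual queues is still compared exactly once, and the redundant self and backward comparisons disappear. The same idea in isolation, as a user-space reduction rather than kernel code:

#include <stdbool.h>

/* true if any two entries share a priority; each pair (i, n) is
 * visited once because the inner loop starts just past the outer
 * index */
static bool has_duplicate_prio(const int *prio, int count)
{
	int i, n;

	for (i = 0; i < count; i++)
		for (n = i + 1; n < count; n++)
			if (prio[n] == prio[i])
				return true;
	return false;
}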
-		if (!q->backlog && !gred_wred_mode(t))
-			red_start_of_idle_period(&q->vars);
+		if (gred_wred_mode(t)) {
+			if (!sch->qstats.backlog)
+				red_start_of_idle_period(&t->wred_set);
+		} else {
+			if (!q->backlog)
+				red_start_of_idle_period(&q->vars);
+		}
 		}
 
 		return skb;
 	}
 
-	if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
-		red_start_of_idle_period(&t->wred_set);
-
 	return NULL;
 }
 
@@ -291,19 +293,20 @@ static unsigned int gred_drop(struct Qdisc *sch)
 			q->backlog -= len;
 			q->stats.other++;
 
-			if (!q->backlog && !gred_wred_mode(t))
-				red_start_of_idle_period(&q->vars);
+			if (gred_wred_mode(t)) {
+				if (!sch->qstats.backlog)
+					red_start_of_idle_period(&t->wred_set);
+			} else {
+				if (!q->backlog)
+					red_start_of_idle_period(&q->vars);
+			}
 		}
 
 		qdisc_drop(skb, sch);
 		return len;
 	}
 
-	if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
-		red_start_of_idle_period(&t->wred_set);
-
 	return 0;
-
 }
 
 static void gred_reset(struct Qdisc *sch)
@@ -535,6 +538,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
 	for (i = 0; i < MAX_DPs; i++) {
 		struct gred_sched_data *q = table->tab[i];
 		struct tc_gred_qopt opt;
+		unsigned long qavg;
 
 		memset(&opt, 0, sizeof(opt));
 
@@ -566,7 +570,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
 		if (gred_wred_mode(table))
 			gred_load_wred_set(table, q);
 
-		opt.qave = red_calc_qavg(&q->parms, &q->vars, q->vars.qavg);
+		qavg = red_calc_qavg(&q->parms, &q->vars,
+				     q->vars.qavg >> q->parms.Wlog);
+		opt.qave = qavg >> q->parms.Wlog;
 
 append_opt:
 		if (nla_append(skb, sizeof(opt), &opt) < 0)
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 9af01f3df18c..211a21217045 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -203,6 +203,34 @@ out:
 	return index;
 }
 
+/* Length of the next packet (0 if the queue is empty). */
+static unsigned int qdisc_peek_len(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	skb = sch->ops->peek(sch);
+	return skb ?
qdisc_pkt_len(skb) : 0; +} + +static void qfq_deactivate_class(struct qfq_sched *, struct qfq_class *); +static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, + unsigned int len); + +static void qfq_update_class_params(struct qfq_sched *q, struct qfq_class *cl, + u32 lmax, u32 inv_w, int delta_w) +{ + int i; + + /* update qfq-specific data */ + cl->lmax = lmax; + cl->inv_w = inv_w; + i = qfq_calc_index(cl->inv_w, cl->lmax); + + cl->grp = &q->groups[i]; + + q->wsum += delta_w; +} + static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg) { @@ -250,6 +278,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, lmax = 1UL << QFQ_MTU_SHIFT; if (cl != NULL) { + bool need_reactivation = false; + if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, &cl->rate_est, qdisc_root_sleeping_lock(sch), @@ -258,12 +288,29 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return err; } - if (inv_w != cl->inv_w) { - sch_tree_lock(sch); - q->wsum += delta_w; - cl->inv_w = inv_w; - sch_tree_unlock(sch); + if (lmax == cl->lmax && inv_w == cl->inv_w) + return 0; /* nothing to update */ + + i = qfq_calc_index(inv_w, lmax); + sch_tree_lock(sch); + if (&q->groups[i] != cl->grp && cl->qdisc->q.qlen > 0) { + /* + * shift cl->F back, to not charge the + * class for the not-yet-served head + * packet + */ + cl->F = cl->S; + /* remove class from its slot in the old group */ + qfq_deactivate_class(q, cl); + need_reactivation = true; } + + qfq_update_class_params(q, cl, lmax, inv_w, delta_w); + + if (need_reactivation) /* activate in new group */ + qfq_activate_class(q, cl, qdisc_peek_len(cl->qdisc)); + sch_tree_unlock(sch); + return 0; } @@ -273,11 +320,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->refcnt = 1; cl->common.classid = classid; - cl->lmax = lmax; - cl->inv_w = inv_w; - i = qfq_calc_index(cl->inv_w, cl->lmax); - cl->grp = &q->groups[i]; + qfq_update_class_params(q, cl, lmax, inv_w, delta_w); cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); @@ -294,7 +338,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return err; } } - q->wsum += weight; sch_tree_lock(sch); qdisc_class_hash_insert(&q->clhash, &cl->common); @@ -711,15 +754,6 @@ static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) } } -/* What is length of next packet in queue (0 if queue is empty) */ -static unsigned int qdisc_peek_len(struct Qdisc *sch) -{ - struct sk_buff *skb; - - skb = sch->ops->peek(sch); - return skb ? qdisc_pkt_len(skb) : 0; -} - /* * Updates the class, returns true if also the group needs to be updated. 
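A condensed restatement of the qfq_change_class() flow above, for a live class whose new (weight, lmax) pair moves it to another group. The helpers are the patch's own; the wrapper is illustrative and assumes the caller already holds sch_tree_lock():

static void example_qfq_reweight(struct qfq_sched *q, struct qfq_class *cl,
				 u32 lmax, u32 inv_w, int delta_w)
{
	bool need_reactivation = false;
	int i = qfq_calc_index(inv_w, lmax);

	if (&q->groups[i] != cl->grp && cl->qdisc->q.qlen > 0) {
		cl->F = cl->S;			/* don't charge the class for
						 * the not-yet-served head */
		qfq_deactivate_class(q, cl);	/* leave the old group slot */
		need_reactivation = true;
	}

	qfq_update_class_params(q, cl, lmax, inv_w, delta_w);

	if (need_reactivation)			/* re-insert in the new group */
		qfq_activate_class(q, cl, qdisc_peek_len(cl->qdisc));
}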
*/ @@ -831,7 +865,10 @@ static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { - cl->S = next->F; + if (qfq_gt(limit, next->F)) + cl->S = next->F; + else /* preserve timestamp correctness */ + cl->S = limit; return; } } @@ -843,11 +880,8 @@ static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); - struct qfq_group *grp; struct qfq_class *cl; int err; - u64 roundedS; - int s; cl = qfq_classify(skb, sch, &err); if (cl == NULL) { @@ -876,11 +910,25 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) return err; /* If reach this point, queue q was idle */ - grp = cl->grp; + qfq_activate_class(q, cl, qdisc_pkt_len(skb)); + + return err; +} + +/* + * Handle class switch from idle to backlogged. + */ +static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, + unsigned int pkt_len) +{ + struct qfq_group *grp = cl->grp; + u64 roundedS; + int s; + qfq_update_start(q, cl); /* compute new finish time and rounded start. */ - cl->F = cl->S + (u64)qdisc_pkt_len(skb) * cl->inv_w; + cl->F = cl->S + (u64)pkt_len * cl->inv_w; roundedS = qfq_round_down(cl->S, grp->slot_shift); /* @@ -917,8 +965,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) skip_update: qfq_slot_insert(grp, cl, roundedS); - - return err; } diff --git a/net/sctp/output.c b/net/sctp/output.c index 838e18b4d7ea..be50aa234dcd 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -364,6 +364,25 @@ finish: return retval; } +static void sctp_packet_release_owner(struct sk_buff *skb) +{ + sk_free(skb->sk); +} + +static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sctp_packet_release_owner; + + /* + * The data chunks have already been accounted for in sctp_sendmsg(), + * therefore only reserve a single byte to keep socket around until + * the packet has been transmitted. + */ + atomic_inc(&sk->sk_wmem_alloc); +} + /* All packets are sent to the network through this function from * sctp_outq_tail(). * @@ -405,7 +424,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) /* Set the owning socket so that we know where to get the * destination IP address. 
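The sctp helpers above generalize to a small idiom: pin a socket for the lifetime of an skb while charging sk_wmem_alloc a single byte rather than skb->truesize, relying on sk_free() to release the socket once sk_wmem_alloc drops to zero. A sketch under that assumption (names are illustrative):

#include <linux/skbuff.h>
#include <net/sock.h>

static void example_release_owner(struct sk_buff *skb)
{
	sk_free(skb->sk);		/* pairs with the atomic_inc() below */
}

static void example_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);		/* detach any previous owner first */
	skb->sk = sk;
	skb->destructor = example_release_owner;
	atomic_inc(&sk->sk_wmem_alloc);	/* one byte keeps sk alive */
}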
*/ - skb_set_owner_w(nskb, sk); + sctp_packet_set_owner_w(nskb, sk); if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sctp_sk(sk)); diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 33d894776192..10c018a5b9fe 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -702,7 +702,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, if (rx_count >= asoc->base.sk->sk_rcvbuf) { if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || - (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize))) + (!sk_rmem_schedule(asoc->base.sk, chunk->skb, + chunk->skb->truesize))) goto fail; } diff --git a/net/socket.c b/net/socket.c index dfe5b66c97e0..edc3c4af9085 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2604,7 +2604,7 @@ static int do_siocgstamp(struct net *net, struct socket *sock, err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv); set_fs(old_fs); if (!err) - err = compat_put_timeval(up, &ktv); + err = compat_put_timeval(&ktv, up); return err; } @@ -2620,7 +2620,7 @@ static int do_siocgstampns(struct net *net, struct socket *sock, err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts); set_fs(old_fs); if (!err) - err = compat_put_timespec(up, &kts); + err = compat_put_timespec(&kts, up); return err; } @@ -2657,6 +2657,7 @@ static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf))) return -EFAULT; + memset(&ifc, 0, sizeof(ifc)); if (ifc32.ifcbuf == 0) { ifc32.ifc_len = 0; ifc.ifc_len = 0; diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 9fe8857d8d59..03d03e37a7d5 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -21,6 +21,11 @@ config SUNRPC_XPRT_RDMA If unsure, say N. +config SUNRPC_SWAP + bool + depends on SUNRPC + select NETVM + config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism" depends on SUNRPC && CRYPTO diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 727e506cacda..b5c067bccc45 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -13,6 +13,7 @@ #include <linux/errno.h> #include <linux/hash.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/gss_api.h> #include <linux/spinlock.h> #ifdef RPC_DEBUG @@ -122,6 +123,59 @@ rpcauth_unregister(const struct rpc_authops *ops) } EXPORT_SYMBOL_GPL(rpcauth_unregister); +/** + * rpcauth_list_flavors - discover registered flavors and pseudoflavors + * @array: array to fill in + * @size: size of "array" + * + * Returns the number of array items filled in, or a negative errno. + * + * The returned array is not sorted by any policy. Callers should not + * rely on the order of the items in the returned array. 
+ */ +int +rpcauth_list_flavors(rpc_authflavor_t *array, int size) +{ + rpc_authflavor_t flavor; + int result = 0; + + spin_lock(&rpc_authflavor_lock); + for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) { + const struct rpc_authops *ops = auth_flavors[flavor]; + rpc_authflavor_t pseudos[4]; + int i, len; + + if (result >= size) { + result = -ENOMEM; + break; + } + + if (ops == NULL) + continue; + if (ops->list_pseudoflavors == NULL) { + array[result++] = ops->au_flavor; + continue; + } + len = ops->list_pseudoflavors(pseudos, ARRAY_SIZE(pseudos)); + if (len < 0) { + result = len; + break; + } + for (i = 0; i < len; i++) { + if (result >= size) { + result = -ENOMEM; + break; + } + array[result++] = pseudos[i]; + } + } + spin_unlock(&rpc_authflavor_lock); + + dprintk("RPC: %s returns %d\n", __func__, result); + return result; +} +EXPORT_SYMBOL_GPL(rpcauth_list_flavors); + struct rpc_auth * rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt) { diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index d3ad81f8da5b..34c522021004 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1619,6 +1619,7 @@ static const struct rpc_authops authgss_ops = { .crcreate = gss_create_cred, .pipes_create = gss_pipes_dentries_create, .pipes_destroy = gss_pipes_dentries_destroy, + .list_pseudoflavors = gss_mech_list_pseudoflavors, }; static const struct rpc_credops gss_credops = { diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 782bfe1b6465..b174fcd9ff4c 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -239,14 +239,28 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor) EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor); -int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr) +/** + * gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors + * @array: array to fill in + * @size: size of "array" + * + * Returns the number of array items filled in, or a negative errno. + * + * The returned array is not sorted by any policy. Callers should not + * rely on the order of the items in the returned array. 
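A hypothetical caller of the new rpcauth_list_flavors() interface; the array size is arbitrary for the example. This function and gss_mech_list_pseudoflavors() below share the same contract: fill up to size entries, return the count, or return a negative errno (-ENOMEM when the caller's array is too small):

#include <linux/kernel.h>
#include <linux/sunrpc/auth.h>

static int example_dump_flavors(void)
{
	rpc_authflavor_t flavors[8];	/* illustrative size */
	int i, n;

	n = rpcauth_list_flavors(flavors, ARRAY_SIZE(flavors));
	if (n < 0)
		return n;		/* e.g. -ENOMEM: array too small */
	for (i = 0; i < n; i++)
		pr_info("flavor[%d] = %u\n", i, flavors[i]);
	return 0;
}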
+ */ +int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) { struct gss_api_mech *pos = NULL; int j, i = 0; spin_lock(®istered_mechs_lock); list_for_each_entry(pos, ®istered_mechs, gm_list) { - for (j=0; j < pos->gm_pf_num; j++) { + for (j = 0; j < pos->gm_pf_num; j++) { + if (i >= size) { + spin_unlock(®istered_mechs_lock); + return -ENOMEM; + } array_ptr[i++] = pos->gm_pfs[j].pseudoflavor; } } @@ -254,8 +268,6 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr) return i; } -EXPORT_SYMBOL_GPL(gss_mech_list_pseudoflavors); - u32 gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service) { diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 47ad2666fdf6..2afd2a84dc35 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1349,8 +1349,11 @@ static int c_show(struct seq_file *m, void *p) if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ seq_printf(m, "# "); - else + else { + if (cache_is_expired(cd, cp)) + seq_printf(m, "# "); cache_put(cp, cd); + } return cd->cache_show(m, cd, cp); } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 00eb859b7de5..fa48c60aef23 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -717,6 +717,15 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) atomic_inc(&clnt->cl_count); if (clnt->cl_softrtry) task->tk_flags |= RPC_TASK_SOFT; + if (sk_memalloc_socks()) { + struct rpc_xprt *xprt; + + rcu_read_lock(); + xprt = rcu_dereference(clnt->cl_xprt); + if (xprt->swapper) + task->tk_flags |= RPC_TASK_SWAPPER; + rcu_read_unlock(); + } /* Add to the client's list of all tasks */ spin_lock(&clnt->cl_lock); list_add_tail(&task->tk_task, &clnt->cl_tasks); @@ -1844,12 +1853,13 @@ call_timeout(struct rpc_task *task) return; } if (RPC_IS_SOFT(task)) { - if (clnt->cl_chatty) + if (clnt->cl_chatty) { rcu_read_lock(); printk(KERN_NOTICE "%s: server %s not responding, timed out\n", clnt->cl_protname, rcu_dereference(clnt->cl_xprt)->servername); rcu_read_unlock(); + } if (task->tk_flags & RPC_TASK_TIMEOUT) rpc_exit(task, -ETIMEDOUT); else diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 92509ffe15fc..a70acae496e4 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -251,7 +251,7 @@ static int rpcb_create_local_unix(struct net *net) if (IS_ERR(clnt)) { dprintk("RPC: failed to create AF_LOCAL rpcbind " "client (errno %ld).\n", PTR_ERR(clnt)); - result = -PTR_ERR(clnt); + result = PTR_ERR(clnt); goto out; } @@ -298,7 +298,7 @@ static int rpcb_create_local_net(struct net *net) if (IS_ERR(clnt)) { dprintk("RPC: failed to create local rpcbind " "client (errno %ld).\n", PTR_ERR(clnt)); - result = -PTR_ERR(clnt); + result = PTR_ERR(clnt); goto out; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 994cfea2bad6..128494ec9a64 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -300,8 +300,9 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); /* * Make an RPC task runnable. * - * Note: If the task is ASYNC, this must be called with - * the spinlock held to protect the wait queue operation. + * Note: If the task is ASYNC, and is being made runnable after sitting on an + * rpc_wait_queue, this must be called with the queue spinlock held to protect + * the wait queue operation. 
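The rpcb_clnt.c hunks above fix a sign bug: PTR_ERR() already yields a negative errno, so the old result = -PTR_ERR(clnt) handed callers a positive value they would not recognize as an error. The correct shape, as a hypothetical caller:

#include <linux/err.h>
#include <linux/sunrpc/clnt.h>

static int example_create_client(struct rpc_create_args *args)
{
	struct rpc_clnt *clnt = rpc_create(args);

	if (IS_ERR(clnt))
		return PTR_ERR(clnt);	/* already negative: do not negate */

	rpc_shutdown_client(clnt);	/* sketch only: release immediately */
	return 0;
}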
*/ static void rpc_make_runnable(struct rpc_task *task) { @@ -790,7 +791,9 @@ void rpc_execute(struct rpc_task *task) static void rpc_async_schedule(struct work_struct *work) { + current->flags |= PF_FSTRANS; __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); + current->flags &= ~PF_FSTRANS; } /** @@ -812,7 +815,10 @@ static void rpc_async_schedule(struct work_struct *work) void *rpc_malloc(struct rpc_task *task, size_t size) { struct rpc_buffer *buf; - gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT; + gfp_t gfp = GFP_NOWAIT; + + if (RPC_IS_SWAPPER(task)) + gfp |= __GFP_MEMALLOC; size += sizeof(struct rpc_buffer); if (size <= RPC_BUFFER_MAXSIZE) @@ -886,7 +892,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta static struct rpc_task * rpc_alloc_task(void) { - return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); + return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO); } /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 88f2bf671960..bac973a31367 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -316,7 +316,6 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) */ void svc_xprt_enqueue(struct svc_xprt *xprt) { - struct svc_serv *serv = xprt->xpt_server; struct svc_pool *pool; struct svc_rqst *rqstp; int cpu; @@ -362,8 +361,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) rqstp, rqstp->rq_xprt); rqstp->rq_xprt = xprt; svc_xprt_get(xprt); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); pool->sp_stats.threads_woken++; wake_up(&rqstp->rq_wait); } else { @@ -640,8 +637,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (xprt) { rqstp->rq_xprt = xprt; svc_xprt_get(xprt); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); /* As there is a shortage of threads and this request * had to be queued, don't allow the thread to wait so @@ -738,6 +733,8 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) else len = xprt->xpt_ops->xpo_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); } svc_xprt_received(xprt); @@ -794,7 +791,8 @@ int svc_send(struct svc_rqst *rqstp) /* Grab mutex to serialize outgoing data. 
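rpc_malloc() above now asks for emergency reserves only when the task is doing swap I/O, rather than falling back to GFP_ATOMIC for every swapper task. The gfp selection, condensed into an illustrative helper (__GFP_MEMALLOC is the NETVM reserve flag this series depends on):

#include <linux/slab.h>
#include <linux/sunrpc/sched.h>

static void *example_rpc_buf_alloc(struct rpc_task *task, size_t size)
{
	gfp_t gfp = GFP_NOWAIT;

	if (RPC_IS_SWAPPER(task))
		gfp |= __GFP_MEMALLOC;	/* this allocation services swap-out,
					 * so it may dip into the reserves */
	return kmalloc(size, gfp);
}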
*/ mutex_lock(&xprt->xpt_mutex); - if (test_bit(XPT_DEAD, &xprt->xpt_flags)) + if (test_bit(XPT_DEAD, &xprt->xpt_flags) + || test_bit(XPT_CLOSE, &xprt->xpt_flags)) len = -ENOTCONN; else len = xprt->xpt_ops->xpo_sendto(rqstp); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 18bc130255a7..998aa8c1807c 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1129,9 +1129,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len >= 0) svsk->sk_tcplen += len; if (len != want) { + svc_tcp_save_pages(svsk, rqstp); if (len < 0 && len != -EAGAIN) goto err_other; - svc_tcp_save_pages(svsk, rqstp); dprintk("svc: incomplete TCP record (%d of %d)\n", svsk->sk_tcplen, svsk->sk_reclen); goto err_noclose; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 0cf165580d8d..0afba1b4b656 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -129,34 +129,6 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) EXPORT_SYMBOL_GPL(xdr_terminate_string); void -xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base, - unsigned int len) -{ - struct kvec *tail = xdr->tail; - u32 *p; - - xdr->pages = pages; - xdr->page_base = base; - xdr->page_len = len; - - p = (u32 *)xdr->head[0].iov_base + XDR_QUADLEN(xdr->head[0].iov_len); - tail->iov_base = p; - tail->iov_len = 0; - - if (len & 3) { - unsigned int pad = 4 - (len & 3); - - *p = 0; - tail->iov_base = (char *)p + (len & 3); - tail->iov_len = pad; - len += pad; - } - xdr->buflen += len; - xdr->len += len; -} -EXPORT_SYMBOL_GPL(xdr_encode_pages); - -void xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, struct page **pages, unsigned int base, unsigned int len) { @@ -457,6 +429,16 @@ xdr_shift_buf(struct xdr_buf *buf, size_t len) EXPORT_SYMBOL_GPL(xdr_shift_buf); /** + * xdr_stream_pos - Return the current offset from the start of the xdr_stream + * @xdr: pointer to struct xdr_stream + */ +unsigned int xdr_stream_pos(const struct xdr_stream *xdr) +{ + return (unsigned int)(XDR_QUADLEN(xdr->buf->len) - xdr->nwords) << 2; +} +EXPORT_SYMBOL_GPL(xdr_stream_pos); + +/** * xdr_init_encode - Initialize a struct xdr_stream for sending data. 
* @xdr: pointer to xdr_stream struct * @buf: pointer to XDR buffer in which to encode data @@ -556,13 +538,11 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b EXPORT_SYMBOL_GPL(xdr_write_pages); static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, - __be32 *p, unsigned int len) + unsigned int len) { if (len > iov->iov_len) len = iov->iov_len; - if (p == NULL) - p = (__be32*)iov->iov_base; - xdr->p = p; + xdr->p = (__be32*)iov->iov_base; xdr->end = (__be32*)(iov->iov_base + len); xdr->iov = iov; xdr->page_ptr = NULL; @@ -609,7 +589,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr) newbase -= xdr->buf->page_base; if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0) - xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len); + xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len); } static bool xdr_set_next_buffer(struct xdr_stream *xdr) @@ -618,7 +598,7 @@ static bool xdr_set_next_buffer(struct xdr_stream *xdr) xdr_set_next_page(xdr); else if (xdr->iov == xdr->buf->head) { if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0) - xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len); + xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len); } return xdr->p != xdr->end; } @@ -634,10 +614,15 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) xdr->buf = buf; xdr->scratch.iov_base = NULL; xdr->scratch.iov_len = 0; + xdr->nwords = XDR_QUADLEN(buf->len); if (buf->head[0].iov_len != 0) - xdr_set_iov(xdr, buf->head, p, buf->len); + xdr_set_iov(xdr, buf->head, buf->len); else if (buf->page_len != 0) xdr_set_page_base(xdr, 0, buf->len); + if (p != NULL && p > xdr->p && xdr->end >= p) { + xdr->nwords -= p - xdr->p; + xdr->p = p; + } } EXPORT_SYMBOL_GPL(xdr_init_decode); @@ -662,12 +647,14 @@ EXPORT_SYMBOL_GPL(xdr_init_decode_pages); static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) { + unsigned int nwords = XDR_QUADLEN(nbytes); __be32 *p = xdr->p; - __be32 *q = p + XDR_QUADLEN(nbytes); + __be32 *q = p + nwords; - if (unlikely(q > xdr->end || q < p)) + if (unlikely(nwords > xdr->nwords || q > xdr->end || q < p)) return NULL; xdr->p = q; + xdr->nwords -= nwords; return p; } @@ -734,6 +721,31 @@ __be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) } EXPORT_SYMBOL_GPL(xdr_inline_decode); +static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + struct kvec *iov; + unsigned int nwords = XDR_QUADLEN(len); + unsigned int cur = xdr_stream_pos(xdr); + + if (xdr->nwords == 0) + return 0; + if (nwords > xdr->nwords) { + nwords = xdr->nwords; + len = nwords << 2; + } + /* Realign pages to current pointer position */ + iov = buf->head; + if (iov->iov_len > cur) + xdr_shrink_bufhead(buf, iov->iov_len - cur); + + /* Truncate page data and move it into the tail */ + if (buf->page_len > len) + xdr_shrink_pagelen(buf, buf->page_len - len); + xdr->nwords = XDR_QUADLEN(buf->len - cur); + return len; +} + /** * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position * @xdr: pointer to xdr_stream struct @@ -742,39 +754,37 @@ EXPORT_SYMBOL_GPL(xdr_inline_decode); * Moves data beyond the current pointer position from the XDR head[] buffer * into the page list. Any data that lies beyond current position + "len" * bytes is moved into the XDR tail[]. 
+ * + * Returns the number of XDR encoded bytes now contained in the pages */ -void xdr_read_pages(struct xdr_stream *xdr, unsigned int len) +unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) { struct xdr_buf *buf = xdr->buf; struct kvec *iov; - ssize_t shift; + unsigned int nwords; unsigned int end; - int padding; + unsigned int padding; - /* Realign pages to current pointer position */ - iov = buf->head; - shift = iov->iov_len + (char *)iov->iov_base - (char *)xdr->p; - if (shift > 0) - xdr_shrink_bufhead(buf, shift); - - /* Truncate page data and move it into the tail */ - if (buf->page_len > len) - xdr_shrink_pagelen(buf, buf->page_len - len); - padding = (XDR_QUADLEN(len) << 2) - len; + len = xdr_align_pages(xdr, len); + if (len == 0) + return 0; + nwords = XDR_QUADLEN(len); + padding = (nwords << 2) - len; xdr->iov = iov = buf->tail; /* Compute remaining message length. */ - end = iov->iov_len; - shift = buf->buflen - buf->len; - if (shift < end) - end -= shift; - else if (shift > 0) - end = 0; + end = ((xdr->nwords - nwords) << 2) + padding; + if (end > iov->iov_len) + end = iov->iov_len; + /* * Position current pointer at beginning of tail, and * set remaining message length. */ xdr->p = (__be32 *)((char *)iov->iov_base + padding); xdr->end = (__be32 *)((char *)iov->iov_base + end); + xdr->page_ptr = NULL; + xdr->nwords = XDR_QUADLEN(end - padding); + return len; } EXPORT_SYMBOL_GPL(xdr_read_pages); @@ -790,12 +800,13 @@ EXPORT_SYMBOL_GPL(xdr_read_pages); */ void xdr_enter_page(struct xdr_stream *xdr, unsigned int len) { - xdr_read_pages(xdr, len); + len = xdr_align_pages(xdr, len); /* * Position current pointer at beginning of tail, and * set remaining message length. */ - xdr_set_page_base(xdr, 0, len); + if (len != 0) + xdr_set_page_base(xdr, 0, len); } EXPORT_SYMBOL_GPL(xdr_enter_page); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a5a402a7d21f..5d7f61d7559c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -969,11 +969,11 @@ static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) return false; } -static void xprt_alloc_slot(struct rpc_task *task) +void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req; + spin_lock(&xprt->reserve_lock); if (!list_empty(&xprt->free)) { req = list_entry(xprt->free.next, struct rpc_rqst, rq_list); list_del(&req->rq_list); @@ -994,12 +994,29 @@ static void xprt_alloc_slot(struct rpc_task *task) default: task->tk_status = -EAGAIN; } + spin_unlock(&xprt->reserve_lock); return; out_init_req: task->tk_status = 0; task->tk_rqstp = req; xprt_request_init(task, xprt); + spin_unlock(&xprt->reserve_lock); +} +EXPORT_SYMBOL_GPL(xprt_alloc_slot); + +void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) +{ + /* Note: grabbing the xprt_lock_write() ensures that we throttle + * new slot allocation if the transport is congested (i.e. when + * reconnecting a stream transport or when out of socket write + * buffer space). 
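xdr_read_pages() above now reports how many XDR-encoded bytes actually landed in the page list, which lets decoders detect truncated replies instead of trusting the requested length. A hypothetical decode helper built on the new return value:

#include <linux/errno.h>
#include <linux/sunrpc/xdr.h>

static int example_decode_page_data(struct xdr_stream *xdr,
				    unsigned int count)
{
	unsigned int recvd = xdr_read_pages(xdr, count);

	if (recvd < count)
		return -EIO;	/* reply was shorter than advertised */
	return 0;
}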
+ */ + if (xprt_lock_write(xprt, task)) { + xprt_alloc_slot(xprt, task); + xprt_release_write(xprt, task); + } } +EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot); static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { @@ -1083,20 +1100,9 @@ void xprt_reserve(struct rpc_task *task) if (task->tk_rqstp != NULL) return; - /* Note: grabbing the xprt_lock_write() here is not strictly needed, - * but ensures that we throttle new slot allocation if the transport - * is congested (e.g. if reconnecting or if we're out of socket - * write buffer space). - */ task->tk_timeout = 0; task->tk_status = -EAGAIN; - if (!xprt_lock_write(xprt, task)) - return; - - spin_lock(&xprt->reserve_lock); - xprt_alloc_slot(task); - spin_unlock(&xprt->reserve_lock); - xprt_release_write(xprt, task); + xprt->ops->alloc_slot(xprt, task); } static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index b446e100286f..5d9202dc7cb1 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -200,6 +200,7 @@ xprt_rdma_connect_worker(struct work_struct *work) int rc = 0; if (!xprt->shutdown) { + current->flags |= PF_FSTRANS; xprt_clear_connected(xprt); dprintk("RPC: %s: %sconnect\n", __func__, @@ -212,10 +213,10 @@ xprt_rdma_connect_worker(struct work_struct *work) out: xprt_wake_pending_tasks(xprt, rc); - out_clear: dprintk("RPC: %s: exit\n", __func__); xprt_clear_connecting(xprt); + current->flags &= ~PF_FSTRANS; } /* @@ -712,6 +713,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) static struct rpc_xprt_ops xprt_rdma_procs = { .reserve_xprt = xprt_rdma_reserve_xprt, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ + .alloc_slot = xprt_alloc_slot, .release_request = xprt_release_rqst_cong, /* ditto */ .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 62d0dac8f780..a35b8e52e551 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1892,6 +1892,8 @@ static void xs_local_setup_socket(struct work_struct *work) if (xprt->shutdown) goto out; + current->flags |= PF_FSTRANS; + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); status = __sock_create(xprt->xprt_net, AF_LOCAL, SOCK_STREAM, 0, &sock, 1); @@ -1925,7 +1927,47 @@ static void xs_local_setup_socket(struct work_struct *work) out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + current->flags &= ~PF_FSTRANS; +} + +#ifdef CONFIG_SUNRPC_SWAP +static void xs_set_memalloc(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, + xprt); + + if (xprt->swapper) + sk_set_memalloc(transport->inet); +} + +/** + * xs_swapper - Tag this transport as being used for swap. 
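The connect workers in this series (xprtrdma above, the xprtsock workers below) all bracket their work with PF_FSTRANS so that memory reclaim entered from socket allocation cannot recurse into filesystem or NFS writeback while the transport is half set up. The recurring shape, as an illustrative worker:

#include <linux/sched.h>
#include <linux/workqueue.h>

static void example_connect_worker(struct work_struct *work)
{
	current->flags |= PF_FSTRANS;	/* no fs reclaim from here on */

	/* ... create the socket and attempt the connect ... */

	current->flags &= ~PF_FSTRANS;	/* cleared on every exit path */
}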
+ * @xprt: transport to tag + * @enable: enable/disable + * + */ +int xs_swapper(struct rpc_xprt *xprt, int enable) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, + xprt); + int err = 0; + + if (enable) { + xprt->swapper++; + xs_set_memalloc(xprt); + } else if (xprt->swapper) { + xprt->swapper--; + sk_clear_memalloc(transport->inet); + } + + return err; +} +EXPORT_SYMBOL_GPL(xs_swapper); +#else +static void xs_set_memalloc(struct rpc_xprt *xprt) +{ } +#endif static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { @@ -1951,6 +1993,8 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) transport->sock = sock; transport->inet = sk; + xs_set_memalloc(xprt); + write_unlock_bh(&sk->sk_callback_lock); } xs_udp_do_set_buffer_size(xprt); @@ -1967,6 +2011,8 @@ static void xs_udp_setup_socket(struct work_struct *work) if (xprt->shutdown) goto out; + current->flags |= PF_FSTRANS; + /* Start by resetting any existing state */ xs_reset_transport(transport); sock = xs_create_sock(xprt, transport, @@ -1985,6 +2031,7 @@ static void xs_udp_setup_socket(struct work_struct *work) out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + current->flags &= ~PF_FSTRANS; } /* @@ -2075,6 +2122,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) if (!xprt_bound(xprt)) goto out; + xs_set_memalloc(xprt); + /* Tell the socket layer to start connecting... */ xprt->stat.connect_count++; xprt->stat.connect_start = jiffies; @@ -2110,6 +2159,8 @@ static void xs_tcp_setup_socket(struct work_struct *work) if (xprt->shutdown) goto out; + current->flags |= PF_FSTRANS; + if (!sock) { clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); sock = xs_create_sock(xprt, transport, @@ -2159,6 +2210,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EINPROGRESS: case -EALREADY: xprt_clear_connecting(xprt); + current->flags &= ~PF_FSTRANS; return; case -EINVAL: /* Happens, for instance, if the user specified a link @@ -2171,6 +2223,7 @@ out_eagain: out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + current->flags &= ~PF_FSTRANS; } /** @@ -2420,6 +2473,7 @@ static void bc_destroy(struct rpc_xprt *xprt) static struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, + .alloc_slot = xprt_alloc_slot, .rpcbind = xs_local_rpcbind, .set_port = xs_local_set_port, .connect = xs_connect, @@ -2436,6 +2490,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, @@ -2453,6 +2508,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, + .alloc_slot = xprt_lock_and_alloc_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 79981d97bc9c..c5ee4ff61364 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -823,6 +823,34 @@ fail: return NULL; } +static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) +{ + struct dentry *dentry; + struct path path; + int err = 0; + /* + * Get the parent directory, calculate the hash for last + * component. 
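The unix_mknod() helper being added here is built on the kern_path_create()/done_path_create() pairing, which replaces the unlock/dput/path_put/mnt_drop_write sequence that unix_bind() used to open-code. The idiom in isolation (illustrative wrapper, same calls as the patch):

#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/security.h>

static int example_mknod(const char *pathname, umode_t mode)
{
	struct path path;
	struct dentry *dentry;
	int err;

	dentry = kern_path_create(AT_FDCWD, pathname, &path, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);	/* parent lookup/lock failed */

	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);

	done_path_create(&path, dentry);	/* unlock, dput, path_put */
	return err;
}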
+ */ + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + return err; + + /* + * All right, let's create it. + */ + err = security_path_mknod(&path, dentry, mode, 0); + if (!err) { + err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); + if (!err) { + res->mnt = mntget(path.mnt); + res->dentry = dget(dentry); + } + } + done_path_create(&path, dentry); + return err; +} static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { @@ -831,8 +859,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; - struct dentry *dentry = NULL; - struct path path; int err; unsigned int hash; struct unix_address *addr; @@ -869,43 +895,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) atomic_set(&addr->refcnt, 1); if (sun_path[0]) { - umode_t mode; - err = 0; - /* - * Get the parent directory, calculate the hash for last - * component. - */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out_mknod_parent; - - /* - * All right, let's create it. - */ - mode = S_IFSOCK | + struct path path; + umode_t mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = mnt_want_write(path.mnt); - if (err) - goto out_mknod_dput; - err = security_path_mknod(&path, dentry, mode, 0); - if (err) - goto out_mknod_drop_write; - err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); -out_mknod_drop_write: - mnt_drop_write(path.mnt); - if (err) - goto out_mknod_dput; - mutex_unlock(&path.dentry->d_inode->i_mutex); - dput(path.dentry); - path.dentry = dentry; - + err = unix_mknod(sun_path, mode, &path); + if (err) { + if (err == -EEXIST) + err = -EADDRINUSE; + unix_release_addr(addr); + goto out_up; + } addr->hash = UNIX_HASH_SIZE; - } - - spin_lock(&unix_table_lock); - - if (!sun_path[0]) { + hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1); + spin_lock(&unix_table_lock); + u->path = path; + list = &unix_socket_table[hash]; + } else { + spin_lock(&unix_table_lock); err = -EADDRINUSE; if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { @@ -914,9 +920,6 @@ out_mknod_drop_write: } list = &unix_socket_table[addr->hash]; - } else { - list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; - u->path = path; } err = 0; @@ -930,16 +933,6 @@ out_up: mutex_unlock(&u->readlock); out: return err; - -out_mknod_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); -out_mknod_parent: - if (err == -EEXIST) - err = -EADDRINUSE; - unix_release_addr(addr); - goto out_up; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) @@ -1457,7 +1450,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; @@ -1626,7 +1619,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 788a12c1eb5d..2ab785064b7e 100644 --- a/net/wanrouter/wanmain.c +++ 
b/net/wanrouter/wanmain.c @@ -602,36 +602,31 @@ static int wanrouter_device_new_if(struct wan_device *wandev, * successfully, add it to the interface list. */ - if (dev->name == NULL) { - err = -EINVAL; - } else { +#ifdef WANDEBUG + printk(KERN_INFO "%s: registering interface %s...\n", + wanrouter_modname, dev->name); +#endif - #ifdef WANDEBUG - printk(KERN_INFO "%s: registering interface %s...\n", - wanrouter_modname, dev->name); - #endif - - err = register_netdev(dev); - if (!err) { - struct net_device *slave = NULL; - unsigned long smp_flags=0; - - lock_adapter_irq(&wandev->lock, &smp_flags); - - if (wandev->dev == NULL) { - wandev->dev = dev; - } else { - for (slave=wandev->dev; - DEV_TO_SLAVE(slave); - slave = DEV_TO_SLAVE(slave)) - DEV_TO_SLAVE(slave) = dev; - } - ++wandev->ndev; - - unlock_adapter_irq(&wandev->lock, &smp_flags); - err = 0; /* done !!! */ - goto out; + err = register_netdev(dev); + if (!err) { + struct net_device *slave = NULL; + unsigned long smp_flags=0; + + lock_adapter_irq(&wandev->lock, &smp_flags); + + if (wandev->dev == NULL) { + wandev->dev = dev; + } else { + for (slave=wandev->dev; + DEV_TO_SLAVE(slave); + slave = DEV_TO_SLAVE(slave)) + DEV_TO_SLAVE(slave) = dev; } + ++wandev->ndev; + + unlock_adapter_irq(&wandev->lock, &smp_flags); + err = 0; /* done !!! */ + goto out; } if (wandev->del_if) wandev->del_if(wandev, dev); diff --git a/net/wireless/core.c b/net/wireless/core.c index 31b40cc4a9c3..dcd64d5b07aa 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -952,6 +952,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, */ synchronize_rcu(); INIT_LIST_HEAD(&wdev->list); + /* + * Ensure that all events have been processed and + * freed. + */ + cfg80211_process_wdev_events(wdev); break; case NETDEV_PRE_UP: if (!(wdev->wiphy->interface_modes & BIT(wdev->iftype))) diff --git a/net/wireless/core.h b/net/wireless/core.h index 5206c6844fd7..bc7430b54771 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -426,6 +426,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, u32 *flags, struct vif_params *params); void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); +void cfg80211_process_wdev_events(struct wireless_dev *wdev); int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 97026f3b215a..1e37dbf00cb3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5633,8 +5633,10 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) sizeof(connect.ht_capa_mask)); if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) { - if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) + if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) { + kfree(connkeys); return -EINVAL; + } memcpy(&connect.ht_capa, nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]), sizeof(connect.ht_capa)); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2303ee73b50a..2ded3c7fad06 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -680,6 +680,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_IBSS; if (rd_flags & NL80211_RRF_DFS) channel_flags |= IEEE80211_CHAN_RADAR; + if (rd_flags & NL80211_RRF_NO_OFDM) + channel_flags |= IEEE80211_CHAN_NO_OFDM; return channel_flags; } @@ -901,7 +903,21 @@ static void handle_channel(struct wiphy *wiphy, chan->max_antenna_gain = min(chan->orig_mag, (int) 
MBI_TO_DBI(power_rule->max_antenna_gain)); chan->max_reg_power = (int) MBM_TO_DBM(power_rule->max_eirp); - chan->max_power = min(chan->max_power, chan->max_reg_power); + if (chan->orig_mpwr) { + /* + * Devices that have their own custom regulatory domain + * but also use WIPHY_FLAG_STRICT_REGULATORY will follow the + * passed country IE power settings. + */ + if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && + wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY && + wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY) + chan->max_power = chan->max_reg_power; + else + chan->max_power = min(chan->orig_mpwr, + chan->max_reg_power); + } else + chan->max_power = chan->max_reg_power; } static void handle_band(struct wiphy *wiphy, @@ -1885,6 +1901,7 @@ static void restore_custom_reg_settings(struct wiphy *wiphy) chan->flags = chan->orig_flags; chan->max_antenna_gain = chan->orig_mag; chan->max_power = chan->orig_mpwr; + chan->beacon_found = false; } } } diff --git a/net/wireless/util.c b/net/wireless/util.c index 26f8cd30f712..994e2f0cc7a8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -735,7 +735,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) wdev->connect_keys = NULL; } -static void cfg80211_process_wdev_events(struct wireless_dev *wdev) +void cfg80211_process_wdev_events(struct wireless_dev *wdev) { struct cfg80211_event *ev; unsigned long flags; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 54a0dc2e2f8d..ab2bb42fe094 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -212,7 +212,7 @@ resume: /* only the first xfrm gets the encap type */ encap_type = 0; - if (async && x->repl->check(x, skb, seq)) { + if (async && x->repl->recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c5a5165a5927..387848e90078 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -585,6 +585,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_pol_hold(policy); net->xfrm.policy_count[dir]++; atomic_inc(&flow_cache_genid); + rt_genid_bump(net); if (delpol) __xfrm_policy_unlink(delpol, dir); policy->index = delpol ? 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c5a5165a5927..387848e90078 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -585,6 +585,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_pol_hold(policy); net->xfrm.policy_count[dir]++; atomic_inc(&flow_cache_genid); + rt_genid_bump(net); if (delpol) __xfrm_policy_unlink(delpol, dir); policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir); @@ -1357,6 +1358,8 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); xdst->flo.ops = &xfrm_bundle_fc_ops; + if (afinfo->init_dst) + afinfo->init_dst(net, xdst); } else xdst = ERR_PTR(-ENOBUFS); @@ -1761,7 +1764,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family, if (!afinfo) { dst_release(dst_orig); - ret = ERR_PTR(-EINVAL); + return ERR_PTR(-EINVAL); } else { ret = afinfo->blackhole_route(net, dst_orig); } diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 2f6d11d04a2b..3efb07d3eb27 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -420,6 +420,18 @@ err: return -EINVAL; } +static int xfrm_replay_recheck_esn(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) +{ + if (unlikely(XFRM_SKB_CB(skb)->seq.input.hi != + htonl(xfrm_replay_seqhi(x, net_seq)))) { + x->stats.replay_window++; + return -EINVAL; + } + + return xfrm_replay_check_esn(x, skb, net_seq); +} + static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) { unsigned int bitnr, nr, i; @@ -479,6 +491,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) static struct xfrm_replay xfrm_replay_legacy = { .advance = xfrm_replay_advance, .check = xfrm_replay_check, + .recheck = xfrm_replay_check, .notify = xfrm_replay_notify, .overflow = xfrm_replay_overflow, }; @@ -486,6 +499,7 @@ static struct xfrm_replay xfrm_replay_bmp = { .advance = xfrm_replay_advance_bmp, .check = xfrm_replay_check_bmp, + .recheck = xfrm_replay_check_bmp, .notify = xfrm_replay_notify_bmp, .overflow = xfrm_replay_overflow_bmp, }; @@ -493,6 +507,7 @@ static struct xfrm_replay xfrm_replay_esn = { .advance = xfrm_replay_advance_esn, .check = xfrm_replay_check_esn, + .recheck = xfrm_replay_recheck_esn, .notify = xfrm_replay_notify_bmp, .overflow = xfrm_replay_overflow_esn, };
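[Editor's note: the three struct xfrm_replay definitions above are per-mode ops tables. Legacy and bitmap modes can simply reuse their .check callback as .recheck, while ESN must first re-derive the sequence-number high bits before repeating the window test. A compact standalone sketch of the function-pointer-table pattern, with hypothetical names and a toy predicate in place of the real window logic:]

#include <stdio.h>

/* Each replay mode supplies its own callbacks; modes whose window
 * cannot be invalidated across an async resume alias .check. */
struct replay_ops {
    int (*check)(unsigned int seq);
    int (*recheck)(unsigned int seq);
};

static int check_simple(unsigned int seq)
{
    return seq == 0 ? -1 : 0;  /* toy predicate, not a real window test */
}

static int recheck_esn(unsigned int seq)
{
    /* An ESN-style mode would re-derive the sequence high bits here
     * before repeating the base check; this stub only repeats it. */
    return check_simple(seq);
}

static const struct replay_ops replay_legacy = {
    .check   = check_simple,
    .recheck = check_simple,   /* alias, like xfrm_replay_legacy */
};

static const struct replay_ops replay_esn = {
    .check   = check_simple,
    .recheck = recheck_esn,    /* extra work, like xfrm_replay_esn */
};

int main(void)
{
    printf("legacy recheck(7): %d\n", replay_legacy.recheck(7));
    printf("esn    recheck(7): %d\n", replay_esn.recheck(7));
    return 0;
}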
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 5b228f97d4b3..210be48d8ae3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -415,8 +415,17 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me) if (x->lft.hard_add_expires_seconds) { long tmo = x->lft.hard_add_expires_seconds + x->curlft.add_time - now; - if (tmo <= 0) - goto expired; + if (tmo <= 0) { + if (x->xflags & XFRM_SOFT_EXPIRE) { + /* enter hard expire without soft expire first?! + * setting a new date could trigger this. + * workaround: fix x->curlft.add_time as below: + */ + x->curlft.add_time = now - x->saved_tmo - 1; + tmo = x->lft.hard_add_expires_seconds - x->saved_tmo; + } else + goto expired; + } if (tmo < next) next = tmo; } @@ -433,10 +442,14 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me) if (x->lft.soft_add_expires_seconds) { long tmo = x->lft.soft_add_expires_seconds + x->curlft.add_time - now; - if (tmo <= 0) + if (tmo <= 0) { warn = 1; - else if (tmo < next) + x->xflags &= ~XFRM_SOFT_EXPIRE; + } else if (tmo < next) { next = tmo; + x->xflags |= XFRM_SOFT_EXPIRE; + x->saved_tmo = tmo; + } } if (x->lft.soft_use_expires_seconds) { long tmo = x->lft.soft_use_expires_seconds + @@ -1981,8 +1994,10 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay) goto error; x->outer_mode = xfrm_get_mode(x->props.mode, family); - if (x->outer_mode == NULL) + if (x->outer_mode == NULL) { + err = -EPROTONOSUPPORT; goto error; + } if (init_replay) { err = xfrm_init_replay(x); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e75d8e47f35c..289f4bf18ff0 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -123,9 +123,21 @@ static inline int verify_replay(struct xfrm_usersa_info *p, struct nlattr **attrs) { struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL]; + struct xfrm_replay_state_esn *rs; - if ((p->flags & XFRM_STATE_ESN) && !rt) - return -EINVAL; + if (p->flags & XFRM_STATE_ESN) { + if (!rt) + return -EINVAL; + + rs = nla_data(rt); + + if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) + return -EINVAL; + + if (nla_len(rt) < xfrm_replay_state_esn_len(rs) && + nla_len(rt) != sizeof(*rs)) + return -EINVAL; + } if (!rt) return 0; @@ -370,14 +382,15 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es struct nlattr *rp) { struct xfrm_replay_state_esn *up; + int ulen; if (!replay_esn || !rp) return 0; up = nla_data(rp); + ulen = xfrm_replay_state_esn_len(up); - if (xfrm_replay_state_esn_len(replay_esn) != - xfrm_replay_state_esn_len(up)) + if (nla_len(rp) < ulen || xfrm_replay_state_esn_len(replay_esn) != ulen) return -EINVAL; return 0; @@ -388,22 +401,28 @@ static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn struct nlattr *rta) { struct xfrm_replay_state_esn *p, *pp, *up; + int klen, ulen; if (!rta) return 0; up = nla_data(rta); + klen = xfrm_replay_state_esn_len(up); + ulen = nla_len(rta) >= klen ? klen : sizeof(*up); - p = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL); + p = kzalloc(klen, GFP_KERNEL); if (!p) return -ENOMEM; - pp = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL); + pp = kzalloc(klen, GFP_KERNEL); if (!pp) { kfree(p); return -ENOMEM; } + memcpy(p, up, ulen); + memcpy(pp, up, ulen); + *replay_esn = p; *preplay_esn = pp;
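[Editor's note: the verify_replay() and xfrm_alloc_replay_state_esn() hunks above distrust the user-supplied bmp_len. The length it implies is capped, compared against nla_len() before anything is copied, and the copy itself is bounded by the bytes actually received, with the remainder zero-filled. A standalone sketch of that validate-then-copy pattern, using hypothetical types rather than the kernel's netlink API:]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_BMP_WORDS 128  /* sanity cap, in the spirit of XFRMA_REPLAY_ESN_MAX */

/* Hypothetical attribute layout: a fixed header whose bmp_len field
 * declares how many 32-bit words follow. */
struct esn_attr {
    uint32_t bmp_len;
    uint32_t bmp[];
};

static size_t esn_attr_len(const struct esn_attr *s)
{
    return sizeof(*s) + (size_t)s->bmp_len * sizeof(uint32_t);
}

/* Copy only after proving the declared size is sane and no larger
 * than the bytes that actually arrived; zero-fill the remainder. */
static struct esn_attr *esn_attr_dup(const void *data, size_t data_len)
{
    const struct esn_attr *up = data;
    struct esn_attr *p;
    size_t klen;

    if (data_len < sizeof(*up))
        return NULL;                 /* truncated header */
    if (up->bmp_len > MAX_BMP_WORDS)
        return NULL;                 /* absurd declared length */
    klen = esn_attr_len(up);
    if (data_len < klen)
        return NULL;                 /* payload shorter than it claims */

    p = calloc(1, klen);
    if (p)
        memcpy(p, up, klen);
    return p;
}

int main(void)
{
    /* Word array standing in for wire bytes: claims 2 trailing words. */
    uint32_t wire[3] = { 2, 0xdeadbeef, 0x1 };

    struct esn_attr *bad  = esn_attr_dup(wire, sizeof(uint32_t));
    struct esn_attr *good = esn_attr_dup(wire, sizeof(wire));

    printf("short attribute rejected: %s\n", bad ? "no" : "yes");
    printf("full attribute accepted:  %s\n", good ? "yes" : "no");
    free(good);
    return 0;
}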
@@ -442,10 +461,11 @@ static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info * * somehow made shareable and move it to xfrm_state.c - JHS * */ -static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs) +static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs, + int update_esn) { struct nlattr *rp = attrs[XFRMA_REPLAY_VAL]; - struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL]; + struct nlattr *re = update_esn ? attrs[XFRMA_REPLAY_ESN_VAL] : NULL; struct nlattr *lt = attrs[XFRMA_LTIME_VAL]; struct nlattr *et = attrs[XFRMA_ETIMER_THRESH]; struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; @@ -555,7 +575,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; /* override default values from above */ - xfrm_update_ae_params(x, attrs); + xfrm_update_ae_params(x, attrs, 0); return x; @@ -689,6 +709,7 @@ out: static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) { + memset(p, 0, sizeof(*p)); memcpy(&p->id, &x->id, sizeof(p->id)); memcpy(&p->sel, &x->sel, sizeof(p->sel)); memcpy(&p->lft, &x->lft, sizeof(p->lft)); @@ -742,7 +763,7 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) return -EMSGSIZE; algo = nla_data(nla); - strcpy(algo->alg_name, auth->alg_name); + strncpy(algo->alg_name, auth->alg_name, sizeof(algo->alg_name)); memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8); algo->alg_key_len = auth->alg_key_len; @@ -878,6 +899,7 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, { struct xfrm_dump_info info; struct sk_buff *skb; + int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) @@ -888,9 +910,10 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, info.nlmsg_seq = seq; info.nlmsg_flags = 0; - if (dump_one_state(x, 0, &info)) { + err = dump_one_state(x, 0, &info); + if (err) { kfree_skb(skb); - return NULL; + return ERR_PTR(err); } return skb; @@ -1317,6 +1340,7 @@ static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir) { + memset(p, 0, sizeof(*p)); memcpy(&p->sel, &xp->selector, sizeof(p->sel)); memcpy(&p->lft, &xp->lft, sizeof(p->lft)); memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft)); @@ -1421,6 +1445,7 @@ static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb) struct xfrm_user_tmpl *up = &vec[i]; struct xfrm_tmpl *kp = &xp->xfrm_vec[i]; + memset(up, 0, sizeof(*up)); memcpy(&up->id, &kp->id, sizeof(up->id)); up->family = kp->encap_family; memcpy(&up->saddr, &kp->saddr, sizeof(up->saddr)); @@ -1546,6 +1571,7 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, { struct xfrm_dump_info info; struct sk_buff *skb; + int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) @@ -1556,9 +1582,10 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, info.nlmsg_seq = seq; info.nlmsg_flags = 0; - if (dump_one_policy(xp, dir, 0, &info) < 0) { + err = dump_one_policy(xp, dir, 0, &info); + if (err) { kfree_skb(skb); - return NULL; + return ERR_PTR(err); } return skb; @@ -1822,7 +1849,7 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; spin_lock_bh(&x->lock); - xfrm_update_ae_params(x, attrs); + xfrm_update_ae_params(x, attrs, 1); spin_unlock_bh(&x->lock); c.event = nlh->nlmsg_type;
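[Editor's note: the memset() additions in copy_to_user_state(), copy_to_user_policy() and copy_to_user_tmpl() above all close the same hole: a structure assembled field by field and then sent to userspace otherwise leaks stale kernel stack or heap bytes through compiler padding and never-written members. A standalone sketch of why zeroing first matters, with a hypothetical layout, not the kernel's types:]

#include <stdio.h>
#include <string.h>

/* Hypothetical userspace-visible record: most ABIs pad 3 bytes after
 * 'family', and 'reserved' is never assigned explicitly. */
struct user_record {
    unsigned char family;
    unsigned int  index;
    unsigned char reserved[6];
};

static void fill_record(struct user_record *p, unsigned int index)
{
    /* Zero the whole structure first: padding bytes and untouched
     * members would otherwise carry stale stack contents to the
     * caller, exactly the leak the memset() hunks above close. */
    memset(p, 0, sizeof(*p));
    p->family = 2;     /* e.g. AF_INET */
    p->index  = index;
}

int main(void)
{
    struct user_record rec;
    const unsigned char *b = (const unsigned char *)&rec;
    size_t i;

    fill_record(&rec, 42);
    for (i = 0; i < sizeof(rec); i++)    /* every byte is deterministic */
        printf("%02x%c", b[i], i + 1 < sizeof(rec) ? ' ' : '\n');
    return 0;
}

The strncpy() change in copy_to_user_auth() serves the same goal for a fixed-width name field: unlike strcpy(), it zero-pads the destination to its full size, so no trailing bytes stay uninitialized (at the cost of not guaranteeing NUL termination when the name exactly fills the field).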