diff options
author | Linus Torvalds <torvalds@g5.osdl.org> | 2006-01-05 03:27:41 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-01-05 03:27:41 +0300 |
commit | d347da0deffa1d8f88f0d270eab040e4707c9916 (patch) | |
tree | e0911f2ef4d36a7b44f7a5379feabebbd37dcfc4 /net | |
parent | c6c88bbde4d8b2ffe9886b7130b2e23781d424e5 (diff) | |
parent | 74cb8798222bb7d1aecb0acb91e6eeedf5feb948 (diff) | |
download | linux-d347da0deffa1d8f88f0d270eab040e4707c9916.tar.xz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
Diffstat (limited to 'net')
166 files changed, 5404 insertions, 2308 deletions
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 7982656b9c83..a5144e43aae1 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -63,7 +63,7 @@ #include <linux/atalk.h> struct datalink_proto *ddp_dl, *aarp_dl; -static struct proto_ops atalk_dgram_ops; +static const struct proto_ops atalk_dgram_ops; /**************************************************************************\ * * @@ -1763,7 +1763,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr */ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - int rc = -EINVAL; + int rc = -ENOIOCTLCMD; struct sock *sk = sock->sk; void __user *argp = (void __user *)arg; @@ -1813,23 +1813,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = atif_ioctl(cmd, argp); rtnl_unlock(); break; - /* Physical layer ioctl calls */ - case SIOCSIFLINK: - case SIOCGIFHWADDR: - case SIOCSIFHWADDR: - case SIOCGIFFLAGS: - case SIOCSIFFLAGS: - case SIOCGIFTXQLEN: - case SIOCSIFTXQLEN: - case SIOCGIFMTU: - case SIOCGIFCONF: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCGIFCOUNT: - case SIOCGIFINDEX: - case SIOCGIFNAME: - rc = dev_ioctl(cmd, argp); - break; } return rc; @@ -1841,7 +1824,7 @@ static struct net_proto_family atalk_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { .family = PF_APPLETALK, .owner = THIS_MODULE, .release = atalk_release, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 2684a92da22b..f2c541774dcd 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -102,7 +102,7 @@ static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr, } -static struct proto_ops pvc_proto_ops = { +static const struct proto_ops pvc_proto_ops = { .family = PF_ATMPVC, .owner = THIS_MODULE, diff --git a/net/atm/svc.c b/net/atm/svc.c index d7b266136bf6..3a180cfd7b48 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -613,7 +613,7 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return error; } -static struct proto_ops svc_proto_ops = { +static const struct proto_ops svc_proto_ops = { .family = PF_ATMSVC, .owner = THIS_MODULE, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 1b683f302657..e8753c7fcad1 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -54,7 +54,7 @@ HLIST_HEAD(ax25_list); DEFINE_SPINLOCK(ax25_list_lock); -static struct proto_ops ax25_proto_ops; +static const struct proto_ops ax25_proto_ops; static void ax25_free_sock(struct sock *sk) { @@ -1827,7 +1827,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; default: - res = dev_ioctl(cmd, argp); + res = -ENOIOCTLCMD; break; } release_sock(sk); @@ -1944,7 +1944,7 @@ static struct net_proto_family ax25_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops ax25_proto_ops = { +static const struct proto_ops ax25_proto_ops = { .family = PF_AX25, .owner = THIS_MODULE, .release = ax25_release, diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index ea616e3fc98e..fb031fe9be9e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -287,10 +287,9 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo) timeo = schedule_timeout(timeo); lock_sock(sk); - if (sk->sk_err) { - err = sock_error(sk); + err = sock_error(sk); + if (err) break; - } } set_current_state(TASK_RUNNING); remove_wait_queue(sk->sk_sleep, &wait); diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 9778c6acd53b..ccbaf69afc5b 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -146,7 +146,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return 0; } -static struct proto_ops bnep_sock_ops = { +static const struct proto_ops bnep_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = bnep_sock_release, diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index beb045bf5714..5e22343b6090 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -137,7 +137,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } -static struct proto_ops cmtp_sock_ops = { +static const struct proto_ops cmtp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = cmtp_sock_release, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1d6d0a15c099..84e6c93a044a 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -575,7 +575,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char return 0; } -static struct proto_ops hci_sock_ops = { +static const struct proto_ops hci_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hci_sock_release, diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index f8986f881431..8f8dd931b294 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -143,7 +143,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } -static struct proto_ops hidp_sock_ops = { +static const struct proto_ops hidp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hidp_sock_release, diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index e3bb11ca4235..7f0781e4326f 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -57,7 +57,7 @@ #define VERSION "2.8" -static struct proto_ops l2cap_sock_ops; +static const struct proto_ops l2cap_sock_ops; static struct bt_sock_list l2cap_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -767,8 +767,9 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms BT_DBG("sock %p, sk %p", sock, sk); - if (sk->sk_err) - return sock_error(sk); + err = sock_error(sk); + if (err) + return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; @@ -2160,7 +2161,7 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL); -static struct proto_ops l2cap_sock_ops = { +static const struct proto_ops l2cap_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = l2cap_sock_release, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 6c34261b232e..757d2dd3b02f 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -58,7 +58,7 @@ #define BT_DBG(D...) #endif -static struct proto_ops rfcomm_sock_ops; +static const struct proto_ops rfcomm_sock_ops; static struct bt_sock_list rfcomm_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -907,7 +907,7 @@ static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL); -static struct proto_ops rfcomm_sock_ops = { +static const struct proto_ops rfcomm_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = rfcomm_sock_release, diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 9cb00dc6c08c..6b61323ce23c 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -56,7 +56,7 @@ #define VERSION "0.5" -static struct proto_ops sco_sock_ops; +static const struct proto_ops sco_sock_ops; static struct bt_sock_list sco_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -637,8 +637,9 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock, BT_DBG("sock %p, sk %p", sock, sk); - if (sk->sk_err) - return sock_error(sk); + err = sock_error(sk); + if (err) + return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; @@ -913,7 +914,7 @@ static ssize_t sco_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL); -static struct proto_ops sco_sock_ops = { +static const struct proto_ops sco_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = sco_sock_release, diff --git a/net/bridge/br.c b/net/bridge/br.c index f8f184942aaf..188cc1ac49eb 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -67,3 +67,4 @@ EXPORT_SYMBOL(br_should_route_hook); module_init(br_init) module_exit(br_deinit) MODULE_LICENSE("GPL"); +MODULE_VERSION(BR_VERSION); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f564ee99782d..0b33a7b3a00c 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -15,7 +15,9 @@ #include <linux/kernel.h> #include <linux/netdevice.h> -#include <linux/module.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> + #include <asm/uaccess.h> #include "br_private.h" @@ -82,6 +84,87 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) return 0; } +/* Allow setting mac address of pseudo-bridge to be same as + * any of the bound interfaces + */ +static int br_set_mac_address(struct net_device *dev, void *p) +{ + struct net_bridge *br = netdev_priv(dev); + struct sockaddr *addr = p; + struct net_bridge_port *port; + int err = -EADDRNOTAVAIL; + + spin_lock_bh(&br->lock); + list_for_each_entry(port, &br->port_list, list) { + if (!compare_ether_addr(port->dev->dev_addr, addr->sa_data)) { + br_stp_change_bridge_id(br, addr->sa_data); + err = 0; + break; + } + } + spin_unlock_bh(&br->lock); + + return err; +} + +static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) +{ + strcpy(info->driver, "bridge"); + strcpy(info->version, BR_VERSION); + strcpy(info->fw_version, "N/A"); + strcpy(info->bus_info, "N/A"); +} + +static int br_set_sg(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_SG; + else + br->feature_mask &= ~NETIF_F_SG; + + br_features_recompute(br); + return 0; +} + +static int br_set_tso(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_TSO; + else + br->feature_mask &= ~NETIF_F_TSO; + + br_features_recompute(br); + return 0; +} + +static int br_set_tx_csum(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_IP_CSUM; + else + br->feature_mask &= ~NETIF_F_IP_CSUM; + + br_features_recompute(br); + return 0; +} + +static struct ethtool_ops br_ethtool_ops = { + .get_drvinfo = br_getinfo, + .get_link = ethtool_op_get_link, + .get_sg = ethtool_op_get_sg, + .set_sg = br_set_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = br_set_tx_csum, + .get_tso = ethtool_op_get_tso, + .set_tso = br_set_tso, +}; + void br_dev_setup(struct net_device *dev) { memset(dev->dev_addr, 0, ETH_ALEN); @@ -96,8 +179,12 @@ void br_dev_setup(struct net_device *dev) dev->change_mtu = br_change_mtu; dev->destructor = free_netdev; SET_MODULE_OWNER(dev); + SET_ETHTOOL_OPS(dev, &br_ethtool_ops); dev->stop = br_dev_stop; dev->tx_queue_len = 0; - dev->set_mac_address = NULL; + dev->set_mac_address = br_set_mac_address; dev->priv_flags = IFF_EBRIDGE; + + dev->features = NETIF_F_SG | NETIF_F_FRAGLIST + | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 975abe254b7a..11321197338e 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -32,9 +32,8 @@ * ethtool, use ethtool_ops. Also, since driver might sleep need to * not be holding any locks. */ -static int br_initial_port_cost(struct net_device *dev) +static int port_cost(struct net_device *dev) { - struct ethtool_cmd ecmd = { ETHTOOL_GSET }; struct ifreq ifr; mm_segment_t old_fs; @@ -58,10 +57,6 @@ static int br_initial_port_cost(struct net_device *dev) return 2; case SPEED_10: return 100; - default: - pr_info("bridge: can't decode speed from %s: %d\n", - dev->name, ecmd.speed); - return 100; } } @@ -75,6 +70,35 @@ static int br_initial_port_cost(struct net_device *dev) return 100; /* assume old 10Mbps */ } + +/* + * Check for port carrier transistions. + * Called from work queue to allow for calling functions that + * might sleep (such as speed check), and to debounce. + */ +static void port_carrier_check(void *arg) +{ + struct net_bridge_port *p = arg; + + rtnl_lock(); + if (netif_carrier_ok(p->dev)) { + u32 cost = port_cost(p->dev); + + spin_lock_bh(&p->br->lock); + if (p->state == BR_STATE_DISABLED) { + p->path_cost = cost; + br_stp_enable_port(p); + } + spin_unlock_bh(&p->br->lock); + } else { + spin_lock_bh(&p->br->lock); + if (p->state != BR_STATE_DISABLED) + br_stp_disable_port(p); + spin_unlock_bh(&p->br->lock); + } + rtnl_unlock(); +} + static void destroy_nbp(struct net_bridge_port *p) { struct net_device *dev = p->dev; @@ -102,6 +126,9 @@ static void del_nbp(struct net_bridge_port *p) dev->br_port = NULL; dev_set_promiscuity(dev, -1); + cancel_delayed_work(&p->carrier_check); + flush_scheduled_work(); + spin_lock_bh(&br->lock); br_stp_disable_port(p); spin_unlock_bh(&br->lock); @@ -155,6 +182,7 @@ static struct net_device *new_bridge_dev(const char *name) br->bridge_id.prio[1] = 0x00; memset(br->bridge_id.addr, 0, ETH_ALEN); + br->feature_mask = dev->features; br->stp_enabled = 0; br->designated_root = br->bridge_id; br->root_path_cost = 0; @@ -195,10 +223,9 @@ static int find_portno(struct net_bridge *br) return (index >= BR_MAX_PORTS) ? -EXFULL : index; } -/* called with RTNL */ +/* called with RTNL but without bridge lock */ static struct net_bridge_port *new_nbp(struct net_bridge *br, - struct net_device *dev, - unsigned long cost) + struct net_device *dev) { int index; struct net_bridge_port *p; @@ -215,12 +242,13 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->br = br; dev_hold(dev); p->dev = dev; - p->path_cost = cost; + p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; dev->br_port = p; p->port_no = index; br_init_port(p); p->state = BR_STATE_DISABLED; + INIT_WORK(&p->carrier_check, port_carrier_check, p); kobject_init(&p->kobj); return p; @@ -322,9 +350,8 @@ void br_features_recompute(struct net_bridge *br) struct net_bridge_port *p; unsigned long features, checksum; - features = NETIF_F_SG | NETIF_F_FRAGLIST - | NETIF_F_HIGHDMA | NETIF_F_TSO; - checksum = NETIF_F_IP_CSUM; /* least commmon subset */ + features = br->feature_mask &~ NETIF_F_IP_CSUM; + checksum = br->feature_mask & NETIF_F_IP_CSUM; list_for_each_entry(p, &br->port_list, list) { if (!(p->dev->features @@ -351,7 +378,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (dev->br_port != NULL) return -EBUSY; - if (IS_ERR(p = new_nbp(br, dev, br_initial_port_cost(dev)))) + if (IS_ERR(p = new_nbp(br, dev))) return PTR_ERR(p); if ((err = br_fdb_insert(br, p, dev->dev_addr))) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index b88220a64cd8..c387852f753a 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -53,6 +53,11 @@ int br_handle_frame_finish(struct sk_buff *skb) /* insert into forwarding database after filtering to avoid spoofing */ br_fdb_update(p->br, p, eth_hdr(skb)->h_source); + if (p->state == BR_STATE_LEARNING) { + kfree_skb(skb); + goto out; + } + if (br->dev->flags & IFF_PROMISC) { struct sk_buff *skb2; @@ -107,9 +112,6 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) goto err; - if (p->state == BR_STATE_LEARNING) - br_fdb_update(p->br, p, eth_hdr(skb)->h_source); - if (p->br->stp_enabled && !memcmp(dest, bridge_ula, 5) && !(dest[5] & 0xF0)) { @@ -118,9 +120,10 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) NULL, br_stp_handle_bpdu); return 1; } + goto err; } - else if (p->state == BR_STATE_FORWARDING) { + if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) { if (br_should_route_hook) { if (br_should_route_hook(pskb)) return 0; diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 23422bd53a5e..223f8270daee 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -26,6 +26,7 @@ #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/netfilter_bridge.h> @@ -33,8 +34,11 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter_arp.h> #include <linux/in_route.h> + #include <net/ip.h> #include <net/ipv6.h> +#include <net/route.h> + #include <asm/uaccess.h> #include <asm/checksum.h> #include "br_private.h" diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index 917311c6828b..a43a9c1d50d7 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -52,17 +52,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v br_stp_recalculate_bridge_id(br); break; - case NETDEV_CHANGE: /* device is up but carrier changed */ - if (!(br->dev->flags & IFF_UP)) - break; - - if (netif_carrier_ok(dev)) { - if (p->state == BR_STATE_DISABLED) - br_stp_enable_port(p); - } else { - if (p->state != BR_STATE_DISABLED) - br_stp_disable_port(p); - } + case NETDEV_CHANGE: + if (br->dev->flags & IFF_UP) + schedule_delayed_work(&p->carrier_check, BR_PORT_DEBOUNCE); break; case NETDEV_FEAT_CHANGE: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index bdf95a74d8cd..c5bd631ffcd5 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -27,6 +27,10 @@ #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<<BR_PORT_BITS) +#define BR_PORT_DEBOUNCE (HZ/10) + +#define BR_VERSION "2.1" + typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; typedef __u16 port_id; @@ -78,6 +82,7 @@ struct net_bridge_port struct timer_list hold_timer; struct timer_list message_age_timer; struct kobject kobj; + struct work_struct carrier_check; struct rcu_head rcu; }; @@ -90,6 +95,7 @@ struct net_bridge spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; struct list_head age_list; + unsigned long feature_mask; /* STP */ bridge_id designated_root; @@ -201,6 +207,7 @@ extern void br_stp_disable_bridge(struct net_bridge *br); extern void br_stp_enable_port(struct net_bridge_port *p); extern void br_stp_disable_port(struct net_bridge_port *p); extern void br_stp_recalculate_bridge_id(struct net_bridge *br); +extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a); extern void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio); extern void br_stp_set_port_priority(struct net_bridge_port *p, diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index ac09b6a23523..cc047f7fb6ef 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -120,8 +120,7 @@ void br_stp_disable_port(struct net_bridge_port *p) } /* called under bridge lock */ -static void br_stp_change_bridge_id(struct net_bridge *br, - const unsigned char *addr) +void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) { unsigned char oldaddr[6]; struct net_bridge_port *p; @@ -158,7 +157,7 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br) list_for_each_entry(p, &br->port_list, list) { if (addr == br_mac_zero || - compare_ether_addr(p->dev->dev_addr, addr) < 0) + memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0) addr = p->dev->dev_addr; } diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index c70b3be23026..b84fc6075fe1 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -196,9 +196,13 @@ config BRIDGE_EBT_LOG To compile it as a module, choose M here. If unsure, say N. config BRIDGE_EBT_ULOG - tristate "ebt: ulog support" + tristate "ebt: ulog support (OBSOLETE)" depends on BRIDGE_NF_EBTABLES help + This option enables the old bridge-specific "ebt_ulog" implementation + which has been obsoleted by the new "nfnetlink_log" code (see + CONFIG_NETFILTER_NETLINK_LOG). + This option adds the ulog watcher, that you can use in any rule in any ebtables table. The packet is passed to a userspace logging daemon using netlink multicast sockets. This differs diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 662975be3d1d..9f6e0193ae10 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -3,13 +3,16 @@ * * Authors: * Bart De Schuymer <bdschuym@pandora.be> + * Harald Welte <laforge@netfilter.org> * * April, 2002 * */ +#include <linux/in.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_log.h> +#include <linux/netfilter.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/if_arp.h> @@ -55,27 +58,30 @@ static void print_MAC(unsigned char *p) } #define myNIPQUAD(a) a[0], a[1], a[2], a[3] -static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, - const struct net_device *in, const struct net_device *out, - const void *data, unsigned int datalen) +static void +ebt_log_packet(unsigned int pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *loginfo, + const char *prefix) { - struct ebt_log_info *info = (struct ebt_log_info *)data; - char level_string[4] = "< >"; + unsigned int bitmask; - level_string[1] = '0' + info->loglevel; spin_lock_bh(&ebt_log_lock); - printk(level_string); - printk("%s IN=%s OUT=%s ", info->prefix, in ? in->name : "", - out ? out->name : ""); + printk("<%c>%s IN=%s OUT=%s MAC source = ", '0' + loginfo->u.log.level, + prefix, in ? in->name : "", out ? out->name : ""); - printk("MAC source = "); print_MAC(eth_hdr(skb)->h_source); printk("MAC dest = "); print_MAC(eth_hdr(skb)->h_dest); printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto)); - if ((info->bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == + if (loginfo->type == NF_LOG_TYPE_LOG) + bitmask = loginfo->u.log.logflags; + else + bitmask = NF_LOG_MASK; + + if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == htons(ETH_P_IP)){ struct iphdr _iph, *ih; @@ -84,10 +90,9 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, printk(" INCOMPLETE IP header"); goto out; } - printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,", - NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); - printk(" IP tos=0x%02X, IP proto=%d", ih->tos, - ih->protocol); + printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u, IP " + "tos=0x%02X, IP proto=%d", NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr), ih->tos, ih->protocol); if (ih->protocol == IPPROTO_TCP || ih->protocol == IPPROTO_UDP) { struct tcpudphdr _ports, *pptr; @@ -104,7 +109,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, goto out; } - if ((info->bitmask & EBT_LOG_ARP) && + if ((bitmask & EBT_LOG_ARP) && ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) || (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) { struct arphdr _arph, *ah; @@ -144,6 +149,21 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, out: printk("\n"); spin_unlock_bh(&ebt_log_lock); + +} + +static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_log_info *info = (struct ebt_log_info *)data; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = info->loglevel; + li.u.log.logflags = info->bitmask; + + nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, info->prefix); } static struct ebt_watcher log = @@ -154,13 +174,32 @@ static struct ebt_watcher log = .me = THIS_MODULE, }; +static struct nf_logger ebt_log_logger = { + .name = "ebt_log", + .logfn = &ebt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { - return ebt_register_watcher(&log); + int ret; + + ret = ebt_register_watcher(&log); + if (ret < 0) + return ret; + if (nf_log_register(PF_BRIDGE, &ebt_log_logger) < 0) { + printk(KERN_WARNING "ebt_log: not logging via system console " + "since somebody else already registered for PF_INET\n"); + /* we cannot make module load fail here, since otherwise + * ebtables userspace would abort */ + } + + return 0; } static void __exit fini(void) { + nf_log_unregister_logger(&ebt_log_logger); ebt_unregister_watcher(&log); } diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index aae26ae2e61f..ce617b3dbbb8 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -3,6 +3,7 @@ * * Authors: * Bart De Schuymer <bdschuym@pandora.be> + * Harald Welte <laforge@netfilter.org> * * November, 2004 * @@ -115,14 +116,13 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) return skb; } -static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, +static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const void *data, unsigned int datalen) + const struct ebt_ulog_info *uloginfo, const char *prefix) { ebt_ulog_packet_msg_t *pm; size_t size, copy_len; struct nlmsghdr *nlh; - struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data; unsigned int group = uloginfo->nlgroup; ebt_ulog_buff_t *ub = &ulog_buffers[group]; spinlock_t *lock = &ub->lock; @@ -216,6 +216,39 @@ alloc_failure: goto unlock; } +/* this function is registered with the netfilter core */ +static void ebt_log_packet(unsigned int pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *li, + const char *prefix) +{ + struct ebt_ulog_info loginfo; + + if (!li || li->type != NF_LOG_TYPE_ULOG) { + loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP; + loginfo.cprange = 0; + loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD; + loginfo.prefix[0] = '\0'; + } else { + loginfo.nlgroup = li->u.ulog.group; + loginfo.cprange = li->u.ulog.copy_len; + loginfo.qthreshold = li->u.ulog.qthreshold; + strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); + } + + ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); +} + +static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data; + + ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL); +} + + static int ebt_ulog_check(const char *tablename, unsigned int hookmask, const struct ebt_entry *e, void *data, unsigned int datalen) { @@ -240,6 +273,12 @@ static struct ebt_watcher ulog = { .me = THIS_MODULE, }; +static struct nf_logger ebt_ulog_logger = { + .name = EBT_ULOG_WATCHER, + .logfn = &ebt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { int i, ret = 0; @@ -265,6 +304,13 @@ static int __init init(void) else if ((ret = ebt_register_watcher(&ulog))) sock_release(ebtulognl->sk_socket); + if (nf_log_register(PF_BRIDGE, &ebt_ulog_logger) < 0) { + printk(KERN_WARNING "ebt_ulog: not logging via ulog " + "since somebody else already registered for PF_BRIDGE\n"); + /* we cannot make module load fail here, since otherwise + * ebtables userspace would abort */ + } + return ret; } @@ -273,6 +319,7 @@ static void __exit fini(void) ebt_ulog_buff_t *ub; int i; + nf_log_unregister_logger(&ebt_ulog_logger); ebt_unregister_watcher(&ulog); for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { ub = &ulog_buffers[i]; diff --git a/net/core/datagram.c b/net/core/datagram.c index 1bcfef51ac58..f8d322e1ea92 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -47,6 +47,7 @@ #include <linux/rtnetlink.h> #include <linux/poll.h> #include <linux/highmem.h> +#include <linux/spinlock.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -200,6 +201,41 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } /** + * skb_kill_datagram - Free a datagram skbuff forcibly + * @sk: socket + * @skb: datagram skbuff + * @flags: MSG_ flags + * + * This function frees a datagram skbuff that was received by + * skb_recv_datagram. The flags argument must match the one + * used for skb_recv_datagram. + * + * If the MSG_PEEK flag is set, and the packet is still on the + * receive queue of the socket, it will be taken off the queue + * before it is freed. + * + * This function currently only disables BH when acquiring the + * sk_receive_queue lock. Therefore it must not be used in a + * context where that lock is acquired in an IRQ context. + */ + +void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) +{ + if (flags & MSG_PEEK) { + spin_lock_bh(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + atomic_dec(&skb->users); + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + + kfree_skb(skb); +} + +EXPORT_SYMBOL(skb_kill_datagram); + +/** * skb_copy_datagram_iovec - Copy a datagram to an iovec. * @skb: buffer to copy * @offset: offset in the buffer to start copying from diff --git a/net/core/dev.c b/net/core/dev.c index a5efc9ae010b..29ba109d3e54 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3276,7 +3276,6 @@ EXPORT_SYMBOL(dev_close); EXPORT_SYMBOL(dev_get_by_flags); EXPORT_SYMBOL(dev_get_by_index); EXPORT_SYMBOL(dev_get_by_name); -EXPORT_SYMBOL(dev_ioctl); EXPORT_SYMBOL(dev_open); EXPORT_SYMBOL(dev_queue_xmit); EXPORT_SYMBOL(dev_remove_pack); diff --git a/net/core/filter.c b/net/core/filter.c index 3a10e0bc90e8..8964d3445588 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. * * Andi Kleen - Fix a few bad bugs and races. + * Kris Katterjohn - Added many additional checks in sk_chk_filter() */ #include <linux/module.h> @@ -250,7 +251,7 @@ load_b: mem[fentry->k] = X; continue; default: - /* Invalid instruction counts as RET */ + WARN_ON(1); return 0; } @@ -283,8 +284,8 @@ load_b: * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain - * no references or jumps that are out of range, no illegal instructions - * and no backward jumps. It must end with a RET instruction + * no references or jumps that are out of range, no illegal + * instructions, and must end with a RET instruction. * * Returns 0 if the rule set is legal or a negative errno code if not. */ @@ -300,38 +301,85 @@ int sk_chk_filter(struct sock_filter *filter, int flen) for (pc = 0; pc < flen; pc++) { /* all jumps are forward as they are not signed */ ftest = &filter[pc]; - if (BPF_CLASS(ftest->code) == BPF_JMP) { - /* but they mustn't jump off the end */ - if (BPF_OP(ftest->code) == BPF_JA) { - /* - * Note, the large ftest->k might cause loops. - * Compare this with conditional jumps below, - * where offsets are limited. --ANK (981016) - */ - if (ftest->k >= (unsigned)(flen-pc-1)) - return -EINVAL; - } else { - /* for conditionals both must be safe */ - if (pc + ftest->jt +1 >= flen || - pc + ftest->jf +1 >= flen) - return -EINVAL; - } - } - /* check for division by zero -Kris Katterjohn 2005-10-30 */ - if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0) - return -EINVAL; + /* Only allow valid instructions */ + switch (ftest->code) { + case BPF_ALU|BPF_ADD|BPF_K: + case BPF_ALU|BPF_ADD|BPF_X: + case BPF_ALU|BPF_SUB|BPF_K: + case BPF_ALU|BPF_SUB|BPF_X: + case BPF_ALU|BPF_MUL|BPF_K: + case BPF_ALU|BPF_MUL|BPF_X: + case BPF_ALU|BPF_DIV|BPF_X: + case BPF_ALU|BPF_AND|BPF_K: + case BPF_ALU|BPF_AND|BPF_X: + case BPF_ALU|BPF_OR|BPF_K: + case BPF_ALU|BPF_OR|BPF_X: + case BPF_ALU|BPF_LSH|BPF_K: + case BPF_ALU|BPF_LSH|BPF_X: + case BPF_ALU|BPF_RSH|BPF_K: + case BPF_ALU|BPF_RSH|BPF_X: + case BPF_ALU|BPF_NEG: + case BPF_LD|BPF_W|BPF_ABS: + case BPF_LD|BPF_H|BPF_ABS: + case BPF_LD|BPF_B|BPF_ABS: + case BPF_LD|BPF_W|BPF_LEN: + case BPF_LD|BPF_W|BPF_IND: + case BPF_LD|BPF_H|BPF_IND: + case BPF_LD|BPF_B|BPF_IND: + case BPF_LD|BPF_IMM: + case BPF_LDX|BPF_W|BPF_LEN: + case BPF_LDX|BPF_B|BPF_MSH: + case BPF_LDX|BPF_IMM: + case BPF_MISC|BPF_TAX: + case BPF_MISC|BPF_TXA: + case BPF_RET|BPF_K: + case BPF_RET|BPF_A: + break; + + /* Some instructions need special checks */ - /* check that memory operations use valid addresses. */ - if (ftest->k >= BPF_MEMWORDS) { - /* but it might not be a memory operation... */ - switch (ftest->code) { - case BPF_ST: - case BPF_STX: - case BPF_LD|BPF_MEM: - case BPF_LDX|BPF_MEM: + case BPF_ALU|BPF_DIV|BPF_K: + /* check for division by zero */ + if (ftest->k == 0) return -EINVAL; - } + break; + + case BPF_LD|BPF_MEM: + case BPF_LDX|BPF_MEM: + case BPF_ST: + case BPF_STX: + /* check for invalid memory addresses */ + if (ftest->k >= BPF_MEMWORDS) + return -EINVAL; + break; + + case BPF_JMP|BPF_JA: + /* + * Note, the large ftest->k might cause loops. + * Compare this with conditional jumps below, + * where offsets are limited. --ANK (981016) + */ + if (ftest->k >= (unsigned)(flen-pc-1)) + return -EINVAL; + break; + + case BPF_JMP|BPF_JEQ|BPF_K: + case BPF_JMP|BPF_JEQ|BPF_X: + case BPF_JMP|BPF_JGE|BPF_K: + case BPF_JMP|BPF_JGE|BPF_X: + case BPF_JMP|BPF_JGT|BPF_K: + case BPF_JMP|BPF_JGT|BPF_X: + case BPF_JMP|BPF_JSET|BPF_K: + case BPF_JMP|BPF_JSET|BPF_X: + /* for conditionals both must be safe */ + if (pc + ftest->jt + 1 >= flen || + pc + ftest->jf + 1 >= flen) + return -EINVAL; + break; + + default: + return -EINVAL; } } diff --git a/net/core/flow.c b/net/core/flow.c index 7e95b39de9fd..c4f25385029f 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -23,6 +23,7 @@ #include <net/flow.h> #include <asm/atomic.h> #include <asm/semaphore.h> +#include <linux/security.h> struct flow_cache_entry { struct flow_cache_entry *next; @@ -30,6 +31,7 @@ struct flow_cache_entry { u8 dir; struct flowi key; u32 genid; + u32 sk_sid; void *object; atomic_t *object_ref; }; @@ -162,7 +164,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) return 0; } -void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, +void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir, flow_resolve_t resolver) { struct flow_cache_entry *fle, **head; @@ -186,6 +188,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && + fle->sk_sid == sk_sid && flow_key_compare(key, &fle->key) == 0) { if (fle->genid == atomic_read(&flow_cache_genid)) { void *ret = fle->object; @@ -210,6 +213,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, *head = fle; fle->family = family; fle->dir = dir; + fle->sk_sid = sk_sid; memcpy(&fle->key, key, sizeof(*key)); fle->object = NULL; flow_count(cpu)++; @@ -221,7 +225,7 @@ nocache: void *obj; atomic_t *obj_ref; - resolver(key, family, dir, &obj, &obj_ref); + resolver(key, sk_sid, family, dir, &obj, &obj_ref); if (fle) { fle->genid = atomic_read(&flow_cache_genid); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 49424a42a2c0..281a632fa6a6 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -13,6 +13,7 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> +#include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7fc3e9e28c34..06cad2d63e8a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -487,9 +487,9 @@ static unsigned int fmt_ip6(char *s,const char ip[16]); /* Module parameters, defaults. */ static int pg_count_d = 1000; /* 1000 pkts by default */ -static int pg_delay_d = 0; -static int pg_clone_skb_d = 0; -static int debug = 0; +static int pg_delay_d; +static int pg_clone_skb_d; +static int debug; static DECLARE_MUTEX(pktgen_sem); static struct pktgen_thread *pktgen_threads = NULL; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 83fee37de38e..070f91cfde59 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -135,17 +135,13 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int fclone) { + struct skb_shared_info *shinfo; struct sk_buff *skb; u8 *data; /* Get the HEAD */ - if (fclone) - skb = kmem_cache_alloc(skbuff_fclone_cache, - gfp_mask & ~__GFP_DMA); - else - skb = kmem_cache_alloc(skbuff_head_cache, - gfp_mask & ~__GFP_DMA); - + skb = kmem_cache_alloc(fclone ? skbuff_fclone_cache : skbuff_head_cache, + gfp_mask & ~__GFP_DMA); if (!skb) goto out; @@ -162,6 +158,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb->tail = data; skb->end = data + size; + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + atomic_set(&shinfo->dataref, 1); + shinfo->nr_frags = 0; + shinfo->tso_size = 0; + shinfo->tso_segs = 0; + shinfo->ufo_size = 0; + shinfo->ip6_frag_id = 0; + shinfo->frag_list = NULL; + if (fclone) { struct sk_buff *child = skb + 1; atomic_t *fclone_ref = (atomic_t *) (child + 1); @@ -171,13 +177,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, child->fclone = SKB_FCLONE_UNAVAILABLE; } - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->tso_size = 0; - skb_shinfo(skb)->tso_segs = 0; - skb_shinfo(skb)->frag_list = NULL; - skb_shinfo(skb)->ufo_size = 0; - skb_shinfo(skb)->ip6_frag_id = 0; out: return skb; nodata: diff --git a/net/core/sock.c b/net/core/sock.c index 13cc3be4f056..6465b0e4c8cb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1488,7 +1488,7 @@ int proto_register(struct proto *prot, int alloc_slab) } } - if (prot->twsk_obj_size) { + if (prot->twsk_prot != NULL) { static const char mask[] = "tw_sock_%s"; timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); @@ -1497,11 +1497,12 @@ int proto_register(struct proto *prot, int alloc_slab) goto out_free_request_sock_slab; sprintf(timewait_sock_slab_name, mask, prot->name); - prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, - prot->twsk_obj_size, - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (prot->twsk_slab == NULL) + prot->twsk_prot->twsk_slab = + kmem_cache_create(timewait_sock_slab_name, + prot->twsk_prot->twsk_obj_size, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; } } @@ -1548,12 +1549,12 @@ void proto_unregister(struct proto *prot) prot->rsk_prot->slab = NULL; } - if (prot->twsk_slab != NULL) { - const char *name = kmem_cache_name(prot->twsk_slab); + if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { + const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab); - kmem_cache_destroy(prot->twsk_slab); + kmem_cache_destroy(prot->twsk_prot->twsk_slab); kfree(name); - prot->twsk_slab = NULL; + prot->twsk_prot->twsk_slab = NULL; } } diff --git a/net/core/stream.c b/net/core/stream.c index 15bfd03e8024..35e25259fd95 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -55,8 +55,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) int done; do { - if (sk->sk_err) - return sock_error(sk); + int err = sock_error(sk); + if (err) + return err; if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) return -EPIPE; if (!*timeo_p) @@ -67,6 +68,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, + !sk->sk_err && !((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); finish_wait(sk->sk_sleep, &wait); @@ -137,7 +139,9 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, sk_stream_memory_free(sk) && + sk_wait_event(sk, ¤t_timeo, !sk->sk_err && + !(sk->sk_shutdown & SEND_SHUTDOWN) && + sk_stream_memory_free(sk) && vm_wait); sk->sk_write_pending--; diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 344a8da153fc..87b27fff6e3b 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,3 +1,7 @@ +obj-$(CONFIG_IPV6) += dccp_ipv6.o + +dccp_ipv6-y := ipv6.o + obj-$(CONFIG_IP_DCCP) += dccp.o dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index c9a62cca22fc..ce9cb77c5c29 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -55,8 +55,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) from = av->dccpav_buf + av->dccpav_buf_head; /* Check if buf_head wraps */ - if (av->dccpav_buf_head + len > av->dccpav_vec_len) { - const u32 tailsize = (av->dccpav_vec_len - av->dccpav_buf_head); + if ((int)av->dccpav_buf_head + len > av->dccpav_vec_len) { + const u32 tailsize = av->dccpav_vec_len - av->dccpav_buf_head; memcpy(to, from, tailsize); to += tailsize; @@ -93,8 +93,14 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len, const gfp_t priority) { - struct dccp_ackvec *av = kmalloc(sizeof(*av) + len, priority); + struct dccp_ackvec *av; + BUG_ON(len == 0); + + if (len > DCCP_MAX_ACKVEC_LEN) + return NULL; + + av = kmalloc(sizeof(*av) + len, priority); if (av != NULL) { av->dccpav_buf_len = len; av->dccpav_buf_head = @@ -117,13 +123,13 @@ void dccp_ackvec_free(struct dccp_ackvec *av) } static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, - const unsigned int index) + const u8 index) { return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK; } static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, - const unsigned int index) + const u8 index) { return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK; } @@ -135,7 +141,7 @@ static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, */ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, const unsigned int packets, - const unsigned char state) + const unsigned char state) { unsigned int gap; signed long new_head; @@ -223,7 +229,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, * could reduce the complexity of this scan.) */ u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno); - unsigned int index = av->dccpav_buf_head; + u8 index = av->dccpav_buf_head; while (1) { const u8 len = dccp_ackvec_len(av, index); @@ -291,7 +297,7 @@ void dccp_ackvec_print(const struct dccp_ackvec *av) } #endif -static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av) +static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av) { /* * As we're keeping track of the ack vector size (dccpav_vec_len) and @@ -301,9 +307,10 @@ static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av) * draft-ietf-dccp-spec-11.txt Appendix A. -acme */ #if 0 - av->dccpav_buf_tail = av->dccpav_ack_ptr + 1; - if (av->dccpav_buf_tail >= av->dccpav_vec_len) - av->dccpav_buf_tail -= av->dccpav_vec_len; + u32 new_buf_tail = av->dccpav_ack_ptr + 1; + if (new_buf_tail >= av->dccpav_vec_len) + new_buf_tail -= av->dccpav_vec_len; + av->dccpav_buf_tail = new_buf_tail; #endif av->dccpav_vec_len -= av->dccpav_sent_len; } @@ -326,7 +333,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, debug_prefix, 1, (unsigned long long)av->dccpav_ack_seqno, (unsigned long long)av->dccpav_ack_ackno); - dccp_ackvec_trow_away_ack_record(av); + dccp_ackvec_throw_away_ack_record(av); av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1; } } @@ -389,7 +396,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, av->dccpav_ack_seqno, (unsigned long long) av->dccpav_ack_ackno); - dccp_ackvec_trow_away_ack_record(av); + dccp_ackvec_throw_away_ack_record(av); } /* * If dccpav_ack_seqno was not received, no problem diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index d0fd6c60c574..f7dfb5f67b87 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -54,16 +54,16 @@ * @dccpav_buf - circular buffer of acknowledgeable packets */ struct dccp_ackvec { - unsigned int dccpav_buf_head; - unsigned int dccpav_buf_tail; u64 dccpav_buf_ackno; u64 dccpav_ack_seqno; u64 dccpav_ack_ackno; - unsigned int dccpav_ack_ptr; - unsigned int dccpav_sent_len; - unsigned int dccpav_vec_len; - unsigned int dccpav_buf_len; struct timeval dccpav_time; + u8 dccpav_buf_head; + u8 dccpav_buf_tail; + u8 dccpav_ack_ptr; + u8 dccpav_sent_len; + u8 dccpav_vec_len; + u8 dccpav_buf_len; u8 dccpav_buf_nonce; u8 dccpav_ack_nonce; u8 dccpav_buf[0]; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index c37eeeaf5c6e..de681c6ad081 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -21,6 +21,8 @@ #define CCID_MAX 255 +struct tcp_info; + struct ccid { unsigned char ccid_id; const char *ccid_name; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index f97b85d55ad8..93f26dd6e6cb 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -59,7 +59,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); #define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ -extern struct proto dccp_v4_prot; +extern struct proto dccp_prot; /* is seq1 < seq2 ? */ static inline int before48(const u64 seq1, const u64 seq2) @@ -228,6 +228,9 @@ extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned len); +extern int dccp_v4_init_sock(struct sock *sk); +extern int dccp_v4_destroy_sock(struct sock *sk); + extern void dccp_close(struct sock *sk, long timeout); extern struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, @@ -238,6 +241,7 @@ extern struct sk_buff *dccp_make_reset(struct sock *sk, extern int dccp_connect(struct sock *sk); extern int dccp_disconnect(struct sock *sk, int flags); +extern void dccp_unhash(struct sock *sk); extern int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); extern int dccp_setsockopt(struct sock *sk, int level, int optname, @@ -249,6 +253,13 @@ extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); extern void dccp_shutdown(struct sock *sk, int how); +extern int inet_dccp_listen(struct socket *sock, int backlog); +extern unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait); +extern void dccp_v4_send_check(struct sock *sk, int len, + struct sk_buff *skb); +extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len); extern int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, const u32 daddr); @@ -256,6 +267,17 @@ extern int dccp_v4_checksum(const struct sk_buff *skb, extern int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code); extern void dccp_send_close(struct sock *sk, const int active); +extern int dccp_invalid_packet(struct sk_buff *skb); + +static inline int dccp_bad_service_code(const struct sock *sk, + const __u32 service) +{ + const struct dccp_sock *dp = dccp_sk(sk); + + if (dp->dccps_service == service) + return 0; + return !dccp_list_has_service(dp->dccps_service_list, service); +} struct dccp_skb_cb { __u8 dccpd_type:4; diff --git a/net/dccp/diag.c b/net/dccp/diag.c index f675d8e642d3..3f78c00e3822 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -28,7 +28,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; - info->tcpi_pmtu = dp->dccps_pmtu_cookie; + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; if (dp->dccps_options.dccpo_send_ack_vector) info->tcpi_options |= TCPI_OPT_SACK; diff --git a/net/dccp/input.c b/net/dccp/input.c index 3454d5941900..b6cba72b44e8 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -151,29 +151,12 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) return 0; } -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned len) +static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) { struct dccp_sock *dp = dccp_sk(sk); - if (dccp_check_seqno(sk, skb)) - goto discard; - - if (dccp_parse_options(sk, skb)) - goto discard; - - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_event_ack_recv(sk, skb); - - if (dp->dccps_options.dccpo_send_ack_vector && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) - goto discard; - - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); - switch (dccp_hdr(skb)->dccph_type) { case DCCP_PKT_DATAACK: case DCCP_PKT_DATA: @@ -250,6 +233,37 @@ discard: return 0; } +int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (dccp_check_seqno(sk, skb)) + goto discard; + + if (dccp_parse_options(sk, skb)) + goto discard; + + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + if (dp->dccps_options.dccpo_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto discard; + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + + return __dccp_rcv_established(sk, skb, dh, len); +discard: + __kfree_skb(skb); + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_rcv_established); + static int dccp_rcv_request_sent_state_process(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, @@ -286,6 +300,12 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } + if (dp->dccps_options.dccpo_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto out_invalid_packet; /* FIXME: change error code */ + dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_update_gsr(sk, dp->dccps_isr); /* @@ -309,7 +329,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } - dccp_sync_mss(sk, dp->dccps_pmtu_cookie); + dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); /* * Step 10: Process REQUEST state (second part) @@ -329,7 +349,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, dccp_set_state(sk, DCCP_PARTOPEN); /* Make sure socket is routed, for correct metrics. */ - inet_sk_rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); @@ -398,9 +418,9 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk, if (dh->dccph_type == DCCP_PKT_DATAACK || dh->dccph_type == DCCP_PKT_DATA) { - dccp_rcv_established(sk, skb, dh, len); + __dccp_rcv_established(sk, skb, dh, len); queued = 1; /* packet was queued - (by dccp_rcv_established) */ + (by __dccp_rcv_established) */ } break; } @@ -444,7 +464,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { - if (dccp_v4_conn_request(sk, skb) < 0) + if (inet_csk(sk)->icsk_af_ops->conn_request(sk, + skb) < 0) return 1; /* FIXME: do congestion control initialization */ @@ -471,14 +492,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); - if (dp->dccps_options.dccpo_send_ack_vector && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) goto discard; + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); } /* @@ -566,3 +587,5 @@ discard: } return 0; } + +EXPORT_SYMBOL_GPL(dccp_rcv_state_process); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 656e13e38cfb..3f244670764a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -19,7 +19,9 @@ #include <net/icmp.h> #include <net/inet_hashtables.h> +#include <net/inet_sock.h> #include <net/sock.h> +#include <net/timewait_sock.h> #include <net/tcp_states.h> #include <net/xfrm.h> @@ -37,7 +39,8 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo); static int dccp_v4_get_port(struct sock *sk, const unsigned short snum) { - return inet_csk_get_port(&dccp_hashinfo, sk, snum); + return inet_csk_get_port(&dccp_hashinfo, sk, snum, + inet_csk_bind_conflict); } static void dccp_v4_hash(struct sock *sk) @@ -45,171 +48,14 @@ static void dccp_v4_hash(struct sock *sk) inet_hash(&dccp_hashinfo, sk); } -static void dccp_v4_unhash(struct sock *sk) +void dccp_unhash(struct sock *sk) { inet_unhash(&dccp_hashinfo, sk); } -/* called with local bh disabled */ -static int __dccp_v4_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const u32 daddr = inet->rcv_saddr; - const u32 saddr = inet->daddr; - const int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash); - const struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { - tw = inet_twsk(sk2); - - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } +EXPORT_SYMBOL_GPL(dccp_unhash); - /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); - sk->sk_hash = hash; - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp != NULL) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw != NULL) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &dccp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -/* - * Bind a port for a connect operation and hash it. - */ -static int dccp_v4_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - do { - head = &dccp_hashinfo.bhash[inet_bhashfn(rover, - dccp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == rover) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__dccp_v4_check_established(sk, - rover, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, - head, rover); - if (tb == NULL) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - if (++rover > high) - rover = low; - } while (--remaining > 0); - - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - /* All locks still held and bhs disabled */ - inet_bind_hash(sk, tb, rover); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(rover); - __inet_hash(&dccp_hashinfo, sk, 0); - } - spin_unlock(&head->lock); - - if (tw != NULL) { - inet_twsk_deschedule(tw, &dccp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &dccp_hashinfo.bhash[inet_bhashfn(snum, - dccp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { - __inet_hash(&dccp_hashinfo, sk, 0); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __dccp_v4_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - -static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) +int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); @@ -259,9 +105,9 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, inet->dport = usin->sin_port; inet->daddr = daddr; - dp->dccps_ext_header_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt != NULL) - dp->dccps_ext_header_len = inet->opt->optlen; + inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; /* * Socket identity is still unknown (sport may be zero). * However we set state to DCCP_REQUESTING and not releasing socket @@ -269,7 +115,7 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, * complete initialization after this. */ dccp_set_state(sk, DCCP_REQUESTING); - err = dccp_v4_hash_connect(sk); + err = inet_hash_connect(&dccp_death_row, sk); if (err != 0) goto failure; @@ -287,16 +133,6 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, usin->sin_port); dccp_update_gss(sk, dp->dccps_iss); - /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. - */ - dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss)); - inet->id = dp->dccps_iss ^ jiffies; err = dccp_connect(sk); @@ -316,6 +152,8 @@ failure: goto out; } +EXPORT_SYMBOL_GPL(dccp_v4_connect); + /* * This routine does path mtu discovery as defined in RFC1191. */ @@ -354,7 +192,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk, mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && - dp->dccps_pmtu_cookie > mtu) { + inet_csk(sk)->icsk_pmtu_cookie > mtu) { dccp_sync_mss(sk, mtu); /* @@ -606,6 +444,17 @@ out: sock_put(sk); } +/* This routine computes an IPv4 DCCP checksum. */ +void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) +{ + const struct inet_sock *inet = inet_sk(sk); + struct dccp_hdr *dh = dccp_hdr(skb); + + dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr); +} + +EXPORT_SYMBOL_GPL(dccp_v4_send_check); + int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) { struct sk_buff *skb; @@ -641,16 +490,6 @@ static inline u64 dccp_v4_init_sequence(const struct sock *sk, dccp_hdr(skb)->dccph_sport); } -static inline int dccp_bad_service_code(const struct sock *sk, - const __u32 service) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - if (dp->dccps_service == service) - return 0; - return !dccp_list_has_service(dp->dccps_service_list, service); -} - int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct inet_request_sock *ireq; @@ -662,7 +501,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) const __u32 service = dccp_hdr_request(skb)->dccph_req_service; struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY; - struct dst_entry *dst = NULL; /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ if (((struct rtable *)skb->dst)->rt_flags & @@ -703,7 +541,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; - /* FIXME: Merge Aristeu's option parsing code when ready */ req->rcv_wnd = 100; /* Fake, option parsing will get the right value */ ireq->opt = NULL; @@ -721,23 +558,22 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) dreq->dreq_iss = dccp_v4_init_sequence(sk, skb); dreq->dreq_service = service; - if (dccp_v4_send_response(sk, req, dst)) + if (dccp_v4_send_response(sk, req, NULL)) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); return 0; drop_and_free: - /* - * FIXME: should be reqsk_free after implementing req->rsk_ops - */ - __reqsk_free(req); + reqsk_free(req); drop: DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); dcb->dccpd_reset_code = reset_code; return -1; } +EXPORT_SYMBOL_GPL(dccp_v4_conn_request); + /* * The three way handshake has completed - we got a valid ACK or DATAACK - * now create the new socket. @@ -792,6 +628,8 @@ exit: return NULL; } +EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); + static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); @@ -1011,7 +849,9 @@ discard: return 0; } -static inline int dccp_invalid_packet(struct sk_buff *skb) +EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); + +int dccp_invalid_packet(struct sk_buff *skb) { const struct dccp_hdr *dh; @@ -1065,29 +905,30 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) return 1; } - /* If the header checksum is incorrect, drop packet and return */ - if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, - skb->nh.iph->daddr) < 0) { - LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is " - "incorrect\n"); - return 1; - } - return 0; } +EXPORT_SYMBOL_GPL(dccp_invalid_packet); + /* this is called when real data arrives */ int dccp_v4_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; struct sock *sk; - int rc; /* Step 1: Check header basics: */ if (dccp_invalid_packet(skb)) goto discard_it; + /* If the header checksum is incorrect, drop packet and return */ + if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, + skb->nh.iph->daddr) < 0) { + LIMIT_NETDEBUG(KERN_WARNING "%s: incorrect header checksum\n", + __FUNCTION__); + goto discard_it; + } + dh = dccp_hdr(skb); DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); @@ -1143,28 +984,10 @@ int dccp_v4_rcv(struct sk_buff *skb) goto do_time_wait; } - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { - dccp_pr_debug("xfrm4_policy_check failed\n"); + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - } - - if (sk_filter(sk, skb, 0)) { - dccp_pr_debug("sk_filter failed\n"); - goto discard_and_relse; - } - - skb->dev = NULL; - - bh_lock_sock(sk); - rc = 0; - if (!sock_owned_by_user(sk)) - rc = dccp_v4_do_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - sock_put(sk); - return rc; + return sk_receive_skb(sk, skb); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -1194,9 +1017,23 @@ do_time_wait: goto no_dccp_socket; } -static int dccp_v4_init_sock(struct sock *sk) +struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { + .queue_xmit = ip_queue_xmit, + .send_check = dccp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = dccp_v4_conn_request, + .syn_recv_sock = dccp_v4_request_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .addr2sockaddr = inet_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in), +}; + +int dccp_v4_init_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); static int dccp_ctl_socket_init = 1; dccp_options_init(&dp->dccps_options); @@ -1236,9 +1073,11 @@ static int dccp_v4_init_sock(struct sock *sk) dccp_ctl_socket_init = 0; dccp_init_xmit_timers(sk); - inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT; + icsk->icsk_rto = DCCP_TIMEOUT_INIT; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; + icsk->icsk_af_ops = &dccp_ipv4_af_ops; + icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_INVALID_VALUE; @@ -1246,7 +1085,9 @@ static int dccp_v4_init_sock(struct sock *sk) return 0; } -static int dccp_v4_destroy_sock(struct sock *sk) +EXPORT_SYMBOL_GPL(dccp_v4_init_sock); + +int dccp_v4_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); @@ -1279,6 +1120,8 @@ static int dccp_v4_destroy_sock(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(dccp_v4_destroy_sock); + static void dccp_v4_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->opt); @@ -1293,7 +1136,11 @@ static struct request_sock_ops dccp_request_sock_ops = { .send_reset = dccp_v4_ctl_send_reset, }; -struct proto dccp_v4_prot = { +static struct timewait_sock_ops dccp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct inet_timewait_sock), +}; + +struct proto dccp_prot = { .name = "DCCP", .owner = THIS_MODULE, .close = dccp_close, @@ -1307,7 +1154,7 @@ struct proto dccp_v4_prot = { .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v4_do_rcv, .hash = dccp_v4_hash, - .unhash = dccp_v4_unhash, + .unhash = dccp_unhash, .accept = inet_csk_accept, .get_port = dccp_v4_get_port, .shutdown = dccp_shutdown, @@ -1316,5 +1163,7 @@ struct proto dccp_v4_prot = { .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), .rsk_prot = &dccp_request_sock_ops, - .twsk_obj_size = sizeof(struct inet_timewait_sock), + .twsk_prot = &dccp_timewait_sock_ops, }; + +EXPORT_SYMBOL_GPL(dccp_prot); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c new file mode 100644 index 000000000000..c609dc78f487 --- /dev/null +++ b/net/dccp/ipv6.c @@ -0,0 +1,1261 @@ +/* + * DCCP over IPv6 + * Linux INET6 implementation + * + * Based on net/dccp6/ipv6.c + * + * Arnaldo Carvalho de Melo <acme@ghostprotocols.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/xfrm.h> + +#include <net/addrconf.h> +#include <net/inet_common.h> +#include <net/inet_hashtables.h> +#include <net/inet_sock.h> +#include <net/inet6_connection_sock.h> +#include <net/inet6_hashtables.h> +#include <net/ip6_route.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/xfrm.h> + +#include "dccp.h" +#include "ipv6.h" + +static void dccp_v6_ctl_send_reset(struct sk_buff *skb); +static void dccp_v6_reqsk_send_ack(struct sk_buff *skb, + struct request_sock *req); +static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb); + +static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); + +static struct inet_connection_sock_af_ops dccp_ipv6_mapped; +static struct inet_connection_sock_af_ops dccp_ipv6_af_ops; + +static int dccp_v6_get_port(struct sock *sk, unsigned short snum) +{ + return inet_csk_get_port(&dccp_hashinfo, sk, snum, + inet6_csk_bind_conflict); +} + +static void dccp_v6_hash(struct sock *sk) +{ + if (sk->sk_state != DCCP_CLOSED) { + if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) { + dccp_prot.hash(sk); + return; + } + local_bh_disable(); + __inet6_hash(&dccp_hashinfo, sk); + local_bh_enable(); + } +} + +static inline u16 dccp_v6_check(struct dccp_hdr *dh, int len, + struct in6_addr *saddr, + struct in6_addr *daddr, + unsigned long base) +{ + return csum_ipv6_magic(saddr, daddr, len, IPPROTO_DCCP, base); +} + +static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + + if (skb->protocol == htons(ETH_P_IPV6)) + return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32, + skb->nh.ipv6h->saddr.s6_addr32, + dh->dccph_dport, + dh->dccph_sport); + else + return secure_dccp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + dh->dccph_dport, + dh->dccph_sport); +} + +static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + struct in6_addr *saddr = NULL, *final_p = NULL, final; + struct flowi fl; + struct dst_entry *dst; + int addr_type; + int err; + + dp->dccps_role = DCCP_ROLE_CLIENT; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + memset(&fl, 0, sizeof(fl)); + + if (np->sndflow) { + fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl.fl6_flowlabel); + if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) { + struct ip6_flowlabel *flowlabel; + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + fl6_sock_release(flowlabel); + } + } + + /* + * connect() to INADDR_ANY means loopback (BSD'ism). + */ + + if (ipv6_addr_any(&usin->sin6_addr)) + usin->sin6_addr.s6_addr[15] = 0x1; + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if(addr_type & IPV6_ADDR_MULTICAST) + return -ENETUNREACH; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + /* If interface is set while binding, indices + * must coincide. + */ + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) + return -EINVAL; + + sk->sk_bound_dev_if = usin->sin6_scope_id; + } + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + return -EINVAL; + } + + ipv6_addr_copy(&np->daddr, &usin->sin6_addr); + np->flow_label = fl.fl6_flowlabel; + + /* + * DCCP over IPv4 + */ + + if (addr_type == IPV6_ADDR_MAPPED) { + u32 exthdrlen = icsk->icsk_ext_hdr_len; + struct sockaddr_in sin; + + SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); + + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + + sin.sin_family = AF_INET; + sin.sin_port = usin->sin6_port; + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; + + icsk->icsk_af_ops = &dccp_ipv6_mapped; + sk->sk_backlog_rcv = dccp_v4_do_rcv; + + err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + + if (err) { + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &dccp_ipv6_af_ops; + sk->sk_backlog_rcv = dccp_v6_do_rcv; + goto failure; + } else { + ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF), + inet->saddr); + ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF), + inet->rcv_saddr); + } + + return err; + } + + if (!ipv6_addr_any(&np->rcv_saddr)) + saddr = &np->rcv_saddr; + + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = usin->sin6_port; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto failure; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto failure; + + if (saddr == NULL) { + saddr = &fl.fl6_src; + ipv6_addr_copy(&np->rcv_saddr, saddr); + } + + /* set the source address */ + ipv6_addr_copy(&np->saddr, saddr); + inet->rcv_saddr = LOOPBACK4_IPV6; + + ip6_dst_store(sk, dst, NULL); + + icsk->icsk_ext_hdr_len = 0; + if (np->opt) + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); + + inet->dport = usin->sin6_port; + + dccp_set_state(sk, DCCP_REQUESTING); + err = inet6_hash_connect(&dccp_death_row, sk); + if (err) + goto late_failure; + /* FIXME */ +#if 0 + dp->dccps_gar = secure_dccp_v6_sequence_number(np->saddr.s6_addr32, + np->daddr.s6_addr32, + inet->sport, + inet->dport); +#endif + err = dccp_connect(sk); + if (err) + goto late_failure; + + return 0; + +late_failure: + dccp_set_state(sk, DCCP_CLOSED); + __sk_dst_reset(sk); +failure: + inet->dport = 0; + sk->sk_route_caps = 0; + return err; +} + +static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data; + const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); + struct ipv6_pinfo *np; + struct sock *sk; + int err; + __u64 seq; + + sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport, + &hdr->saddr, dh->dccph_sport, skb->dev->ifindex); + + if (sk == NULL) { + ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return; + } + + if (sk->sk_state == DCCP_TIME_WAIT) { + inet_twsk_put((struct inet_timewait_sock *)sk); + return; + } + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == DCCP_CLOSED) + goto out; + + np = inet6_sk(sk); + + if (type == ICMPV6_PKT_TOOBIG) { + struct dst_entry *dst = NULL; + + if (sock_owned_by_user(sk)) + goto out; + if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) + goto out; + + /* icmp should have updated the destination cache entry */ + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct flowi fl; + + /* BUGGG_FUTURE: Again, it is not clear how + to handle rthdr case. Ignore this complexity + for now. + */ + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if ((err = ip6_dst_lookup(sk, &dst, &fl))) { + sk->sk_err_soft = -err; + goto out; + } + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + goto out; + } + + } else + dst_hold(dst); + + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { + dccp_sync_mss(sk, dst_mtu(dst)); + } /* else let the usual retransmit timer handle it */ + dst_release(dst); + goto out; + } + + icmpv6_err_convert(type, code, &err); + + seq = DCCP_SKB_CB(skb)->dccpd_seq; + /* Might be for an request_sock */ + switch (sk->sk_state) { + struct request_sock *req, **prev; + case DCCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = inet6_csk_search_req(sk, &prev, dh->dccph_dport, + &hdr->daddr, &hdr->saddr, + inet6_iif(skb)); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + BUG_TRAP(req->sk == NULL); + + if (seq != dccp_rsk(req)->dreq_iss) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case DCCP_REQUESTING: + case DCCP_RESPOND: /* Cannot happen. + It can, it SYNs are crossed. --ANK */ + if (!sock_owned_by_user(sk)) { + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + /* + * Wake people up to see the error + * (see connect in sock.c) + */ + sk->sk_error_report(sk); + + dccp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + if (!sock_owned_by_user(sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else + sk->sk_err_soft = err; + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + + +static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *ireq6 = inet6_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct ipv6_txoptions *opt = NULL; + struct in6_addr *final_p = NULL, final; + struct flowi fl; + int err = -1; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); + fl.fl6_flowlabel = 0; + fl.oif = ireq6->iif; + fl.fl_ip_dport = inet_rsk(req)->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (dst == NULL) { + opt = np->opt; + if (opt == NULL && + np->rxopt.bits.osrcrt == 2 && + ireq6->pktopts) { + struct sk_buff *pktopts = ireq6->pktopts; + struct inet6_skb_parm *rxopt = IP6CB(pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, + (struct ipv6_rt_hdr *)(pktopts->nh.raw + + rxopt->srcrt)); + } + + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto done; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto done; + } + + skb = dccp_make_response(sk, dst, req); + if (skb != NULL) { + struct dccp_hdr *dh = dccp_hdr(skb); + dh->dccph_checksum = dccp_v6_check(dh, skb->len, + &ireq6->loc_addr, + &ireq6->rmt_addr, + csum_partial((char *)dh, + skb->len, + skb->csum)); + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + err = ip6_xmit(sk, skb, &fl, opt, 0); + if (err == NET_XMIT_CN) + err = 0; + } + +done: + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + return err; +} + +static void dccp_v6_reqsk_destructor(struct request_sock *req) +{ + if (inet6_rsk(req)->pktopts != NULL) + kfree_skb(inet6_rsk(req)->pktopts); +} + +static struct request_sock_ops dccp6_request_sock_ops = { + .family = AF_INET6, + .obj_size = sizeof(struct dccp6_request_sock), + .rtx_syn_ack = dccp_v6_send_response, + .send_ack = dccp_v6_reqsk_send_ack, + .destructor = dccp_v6_reqsk_destructor, + .send_reset = dccp_v6_ctl_send_reset, +}; + +static struct timewait_sock_ops dccp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct dccp6_timewait_sock), +}; + +static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct dccp_hdr *dh = dccp_hdr(skb); + + dh->dccph_checksum = csum_ipv6_magic(&np->saddr, &np->daddr, + len, IPPROTO_DCCP, + csum_partial((char *)dh, + dh->dccph_doff << 2, + skb->csum)); +} + +static void dccp_v6_ctl_send_reset(struct sk_buff *rxskb) +{ + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_reset); + struct sk_buff *skb; + struct flowi fl; + u64 seqno; + + if (rxdh->dccph_type == DCCP_PKT_RESET) + return; + + if (!ipv6_unicast_destination(rxskb)) + return; + + /* + * We need to grab some memory, and put together an RST, + * and then put it into the queue to be sent. + */ + + skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_reset_len, GFP_ATOMIC); + if (skb == NULL) + return; + + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_reset_len); + + skb->h.raw = skb_push(skb, dccp_hdr_reset_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_reset_len); + + /* Swap the send and the receive. */ + dh->dccph_type = DCCP_PKT_RESET; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_reset_len / 4; + dh->dccph_x = 1; + dccp_hdr_reset(skb)->dccph_reset_code = + DCCP_SKB_CB(rxskb)->dccpd_reset_code; + + /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */ + seqno = 0; + if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1); + + dccp_hdr_set_seq(dh, seqno); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr); + dh->dccph_checksum = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, + sizeof(*dh), IPPROTO_DCCP, + skb->csum); + fl.proto = IPPROTO_DCCP; + fl.oif = inet6_iif(rxskb); + fl.fl_ip_dport = dh->dccph_dport; + fl.fl_ip_sport = dh->dccph_sport; + + /* sk = NULL, but it is safe for now. RST socket required. */ + if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) { + if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, skb, &fl, NULL, 0); + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + return; + } + } + + kfree_skb(skb); +} + +static void dccp_v6_ctl_send_ack(struct sk_buff *rxskb) +{ + struct flowi fl; + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_ack_bits); + struct sk_buff *skb; + + skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_ack_len, GFP_ATOMIC); + if (skb == NULL) + return; + + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_ack_len); + + skb->h.raw = skb_push(skb, dccp_hdr_ack_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_ack_len); + + /* Build DCCP header and checksum it. */ + dh->dccph_type = DCCP_PKT_ACK; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_ack_len / 4; + dh->dccph_x = 1; + + dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr); + + /* FIXME: calculate checksum, IPv4 also should... */ + + fl.proto = IPPROTO_DCCP; + fl.oif = inet6_iif(rxskb); + fl.fl_ip_dport = dh->dccph_dport; + fl.fl_ip_sport = dh->dccph_sport; + + if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) { + if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, skb, &fl, NULL, 0); + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + return; + } + } + + kfree_skb(skb); +} + +static void dccp_v6_reqsk_send_ack(struct sk_buff *skb, + struct request_sock *req) +{ + dccp_v6_ctl_send_ack(skb); +} + +static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + const struct ipv6hdr *iph = skb->nh.ipv6h; + struct sock *nsk; + struct request_sock **prev; + /* Find possible connection requests. */ + struct request_sock *req = inet6_csk_search_req(sk, &prev, + dh->dccph_sport, + &iph->saddr, + &iph->daddr, + inet6_iif(skb)); + if (req != NULL) + return dccp_check_req(sk, skb, req, prev); + + nsk = __inet6_lookup_established(&dccp_hashinfo, + &iph->saddr, dh->dccph_sport, + &iph->daddr, ntohs(dh->dccph_dport), + inet6_iif(skb)); + + if (nsk != NULL) { + if (nsk->sk_state != DCCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put((struct inet_timewait_sock *)nsk); + return NULL; + } + + return sk; +} + +static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct inet_request_sock *ireq; + struct dccp_sock dp; + struct request_sock *req; + struct dccp_request_sock *dreq; + struct inet6_request_sock *ireq6; + struct ipv6_pinfo *np = inet6_sk(sk); + const __u32 service = dccp_hdr_request(skb)->dccph_req_service; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY; + + if (skb->protocol == htons(ETH_P_IP)) + return dccp_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + if (dccp_bad_service_code(sk, service)) { + reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; + goto drop; + } + /* + * There are no SYN attacks on IPv6, yet... + */ + if (inet_csk_reqsk_queue_is_full(sk)) + goto drop; + + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = inet6_reqsk_alloc(sk->sk_prot->rsk_prot); + if (req == NULL) + goto drop; + + /* FIXME: process options */ + + dccp_openreq_init(req, &dp, skb); + + ireq6 = inet6_rsk(req); + ireq = inet_rsk(req); + ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr); + req->rcv_wnd = 100; /* Fake, option parsing will get the + right value */ + ireq6->pktopts = NULL; + + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + atomic_inc(&skb->users); + ireq6->pktopts = skb; + } + ireq6->iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq6->iif = inet6_iif(skb); + + /* + * Step 3: Process LISTEN state + * + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * + * In fact we defer setting S.GSR, S.SWL, S.SWH to + * dccp_create_openreq_child. + */ + dreq = dccp_rsk(req); + dreq->dreq_isr = dcb->dccpd_seq; + dreq->dreq_iss = dccp_v6_init_sequence(sk, skb); + dreq->dreq_service = service; + + if (dccp_v6_send_response(sk, req, NULL)) + goto drop_and_free; + + inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + return 0; + +drop_and_free: + reqsk_free(req); +drop: + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + dcb->dccpd_reset_code = reset_code; + return -1; +} + +static struct sock *dccp_v6_request_recv_sock(struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *ireq6 = inet6_rsk(req); + struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct inet_sock *newinet; + struct dccp_sock *newdp; + struct dccp6_sock *newdp6; + struct sock *newsk; + struct ipv6_txoptions *opt; + + if (skb->protocol == htons(ETH_P_IP)) { + /* + * v6 mapped + */ + + newsk = dccp_v4_request_recv_sock(sk, skb, req, dst); + if (newsk == NULL) + return NULL; + + newdp6 = (struct dccp6_sock *)newsk; + newdp = dccp_sk(newsk); + newinet = inet_sk(newsk); + newinet->pinet6 = &newdp6->inet6; + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF), + newinet->daddr); + + ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF), + newinet->saddr); + + ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + + inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; + newsk->sk_backlog_rcv = dccp_v4_do_rcv; + newnp->pktoptions = NULL; + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks count + * here, dccp_create_openreq_child now does this for us, see the comment in + * that function for the gory details. -acme + */ + + /* It is tricky place. Until this moment IPv4 tcp + worked with IPv6 icsk.icsk_af_ops. + Sync it now. + */ + dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); + + return newsk; + } + + opt = np->opt; + + if (sk_acceptq_is_full(sk)) + goto out_overflow; + + if (np->rxopt.bits.osrcrt == 2 && + opt == NULL && ireq6->pktopts) { + struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, + (struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw + + rxopt->srcrt)); + } + + if (dst == NULL) { + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet_rsk(req)->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (ip6_dst_lookup(sk, &dst, &fl)) + goto out; + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto out; + } + + newsk = dccp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto out; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, dccp_create_openreq_child now does this for us, see the + * comment in that function for the gory details. -acme + */ + + ip6_dst_store(newsk, dst, NULL); + newsk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + + newdp6 = (struct dccp6_sock *)newsk; + newinet = inet_sk(newsk); + newinet->pinet6 = &newdp6->inet6; + newdp = dccp_sk(newsk); + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr); + ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr); + ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr); + newsk->sk_bound_dev_if = ireq6->iif; + + /* Now IPv6 options... + + First: no IPv4 options. + */ + newinet->opt = NULL; + + /* Clone RX bits */ + newnp->rxopt.all = np->rxopt.all; + + /* Clone pktoptions received with SYN */ + newnp->pktoptions = NULL; + if (ireq6->pktopts != NULL) { + newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC); + kfree_skb(ireq6->pktopts); + ireq6->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* Clone native IPv6 options from listening socket (if any) + + Yes, keeping reference count would be much more clever, + but we make one more one thing there: reattach optmem + to newsk. + */ + if (opt) { + newnp->opt = ipv6_dup_options(newsk, opt); + if (opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + } + + inet_csk(newsk)->icsk_ext_hdr_len = 0; + if (newnp->opt) + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); + + dccp_sync_mss(newsk, dst_mtu(dst)); + + newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; + + __inet6_hash(&dccp_hashinfo, newsk); + inet_inherit_port(&dccp_hashinfo, sk, newsk); + + return newsk; + +out_overflow: + NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); +out: + NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + dst_release(dst); + return NULL; +} + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *opt_skb = NULL; + + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. + From backlog it always goes here. Kerboom... + Fortunately, dccp_rcv_established and rcv_established + handle them correctly, but it is not case with + dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK + */ + + if (skb->protocol == htons(ETH_P_IP)) + return dccp_v4_do_rcv(sk, skb); + + if (sk_filter(sk, skb, 0)) + goto discard; + + /* + * socket locking is here for SMP purposes as backlog rcv + * is currently called with bh processing disabled. + */ + + /* Do Stevens' IPV6_PKTOPTIONS. + + Yes, guys, it is the only place in our code, where we + may make it not affecting IPv4. + The rest of code is protocol independent, + and I do not like idea to uglify IPv4. + + Actually, all the idea behind IPV6_PKTOPTIONS + looks not very well thought. For now we latch + options, received in the last packet, enqueued + by tcp. Feel free to propose better solution. + --ANK (980728) + */ + if (np->rxopt.all) + opt_skb = skb_clone(skb, GFP_ATOMIC); + + if (sk->sk_state == DCCP_OPEN) { /* Fast path */ + if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) + goto reset; + return 0; + } + + if (sk->sk_state == DCCP_LISTEN) { + struct sock *nsk = dccp_v6_hnd_req(sk, skb); + if (!nsk) + goto discard; + + /* + * Queue it on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket.. + */ + if(nsk != sk) { + if (dccp_child_process(sk, nsk, skb)) + goto reset; + if (opt_skb) + __kfree_skb(opt_skb); + return 0; + } + } + + if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) + goto reset; + return 0; + +reset: + dccp_v6_ctl_send_reset(skb); +discard: + if (opt_skb) + __kfree_skb(opt_skb); + kfree_skb(skb); + return 0; +} + +static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + const struct dccp_hdr *dh; + struct sk_buff *skb = *pskb; + struct sock *sk; + + /* Step 1: Check header basics: */ + + if (dccp_invalid_packet(skb)) + goto discard_it; + + dh = dccp_hdr(skb); + + DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); + DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; + + if (dccp_packet_without_ack(skb)) + DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; + else + DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); + + /* Step 2: + * Look up flow ID in table and get corresponding socket */ + sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr, + dh->dccph_sport, + &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport), + inet6_iif(skb)); + /* + * Step 2: + * If no socket ... + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (sk == NULL) + goto no_dccp_socket; + + /* + * Step 2: + * ... or S.state == TIMEWAIT, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + + if (sk->sk_state == DCCP_TIME_WAIT) + goto do_time_wait; + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + + return sk_receive_skb(sk, skb) ? -1 : 0; + +no_dccp_socket: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + /* + * Step 2: + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (dh->dccph_type != DCCP_PKT_RESET) { + DCCP_SKB_CB(skb)->dccpd_reset_code = + DCCP_RESET_CODE_NO_CONNECTION; + dccp_v6_ctl_send_reset(skb); + } +discard_it: + + /* + * Discard frame + */ + + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + inet_twsk_put((struct inet_timewait_sock *)sk); + goto no_dccp_socket; +} + +static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { + .queue_xmit = inet6_csk_xmit, + .send_check = dccp_v6_send_check, + .rebuild_header = inet6_sk_rebuild_header, + .conn_request = dccp_v6_conn_request, + .syn_recv_sock = dccp_v6_request_recv_sock, + .net_header_len = sizeof(struct ipv6hdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + +/* + * DCCP over IPv4 via INET6 API + */ +static struct inet_connection_sock_af_ops dccp_ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = dccp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = dccp_v6_conn_request, + .syn_recv_sock = dccp_v6_request_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int dccp_v6_init_sock(struct sock *sk) +{ + int err = dccp_v4_init_sock(sk); + + if (err == 0) + inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; + + return err; +} + +static int dccp_v6_destroy_sock(struct sock *sk) +{ + dccp_v4_destroy_sock(sk); + return inet6_destroy_sock(sk); +} + +static struct proto dccp_v6_prot = { + .name = "DCCPv6", + .owner = THIS_MODULE, + .close = dccp_close, + .connect = dccp_v6_connect, + .disconnect = dccp_disconnect, + .ioctl = dccp_ioctl, + .init = dccp_v6_init_sock, + .setsockopt = dccp_setsockopt, + .getsockopt = dccp_getsockopt, + .sendmsg = dccp_sendmsg, + .recvmsg = dccp_recvmsg, + .backlog_rcv = dccp_v6_do_rcv, + .hash = dccp_v6_hash, + .unhash = dccp_unhash, + .accept = inet_csk_accept, + .get_port = dccp_v6_get_port, + .shutdown = dccp_shutdown, + .destroy = dccp_v6_destroy_sock, + .orphan_count = &dccp_orphan_count, + .max_header = MAX_DCCP_HEADER, + .obj_size = sizeof(struct dccp6_sock), + .rsk_prot = &dccp6_request_sock_ops, + .twsk_prot = &dccp6_timewait_sock_ops, +}; + +static struct inet6_protocol dccp_v6_protocol = { + .handler = dccp_v6_rcv, + .err_handler = dccp_v6_err, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, +}; + +static struct proto_ops inet6_dccp_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet6_getname, + .poll = dccp_poll, + .ioctl = inet6_ioctl, + .listen = inet_dccp_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct inet_protosw dccp_v6_protosw = { + .type = SOCK_DCCP, + .protocol = IPPROTO_DCCP, + .prot = &dccp_v6_prot, + .ops = &inet6_dccp_ops, + .capability = -1, + .flags = INET_PROTOSW_ICSK, +}; + +static int __init dccp_v6_init(void) +{ + int err = proto_register(&dccp_v6_prot, 1); + + if (err != 0) + goto out; + + err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP); + if (err != 0) + goto out_unregister_proto; + + inet6_register_protosw(&dccp_v6_protosw); +out: + return err; +out_unregister_proto: + proto_unregister(&dccp_v6_prot); + goto out; +} + +static void __exit dccp_v6_exit(void) +{ + inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP); + inet6_unregister_protosw(&dccp_v6_protosw); + proto_unregister(&dccp_v6_prot); +} + +module_init(dccp_v6_init); +module_exit(dccp_v6_exit); + +/* + * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) + * values directly, Also cover the case where the protocol is not specified, + * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP + */ +MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-33-type-6"); +MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-0-type-6"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>"); +MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h new file mode 100644 index 000000000000..e4d4e9309270 --- /dev/null +++ b/net/dccp/ipv6.h @@ -0,0 +1,37 @@ +#ifndef _DCCP_IPV6_H +#define _DCCP_IPV6_H +/* + * net/dccp/ipv6.h + * + * An implementation of the DCCP protocol + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/dccp.h> +#include <linux/ipv6.h> + +struct dccp6_sock { + struct dccp_sock dccp; + /* + * ipv6_pinfo has to be the last member of dccp6_sock, + * see inet6_sk_generic. + */ + struct ipv6_pinfo inet6; +}; + +struct dccp6_request_sock { + struct dccp_request_sock dccp; + struct inet6_request_sock inet6; +}; + +struct dccp6_timewait_sock { + struct inet_timewait_sock inet; + struct inet6_timewait_sock tw6; +}; + +#endif /* _DCCP_IPV6_H */ diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 1393461898bb..29261fc198e7 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -40,6 +40,8 @@ struct inet_timewait_death_row dccp_death_row = { (unsigned long)&dccp_death_row), }; +EXPORT_SYMBOL_GPL(dccp_death_row); + void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -50,7 +52,18 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) if (tw != NULL) { const struct inet_connection_sock *icsk = inet_csk(sk); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); - +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (tw->tw_family == PF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_timewait_sock *tw6; + + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6only = np->ipv6only; + } +#endif /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); @@ -170,6 +183,8 @@ out_free: return newsk; } +EXPORT_SYMBOL_GPL(dccp_create_openreq_child); + /* * Process an incoming packet for RESPOND sockets represented * as an request_sock. @@ -214,7 +229,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, goto drop; } - child = dccp_v4_request_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); if (child == NULL) goto listen_overflow; @@ -236,6 +251,8 @@ drop: goto out; } +EXPORT_SYMBOL_GPL(dccp_check_req); + /* * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with @@ -266,3 +283,5 @@ int dccp_child_process(struct sock *parent, struct sock *child, sock_put(child); return ret; } + +EXPORT_SYMBOL_GPL(dccp_child_process); diff --git a/net/dccp/output.c b/net/dccp/output.c index 74ff87025878..efd7ffb903a1 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -15,6 +15,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> +#include <net/inet_sock.h> #include <net/sock.h> #include "ackvec.h" @@ -43,6 +44,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if (likely(skb != NULL)) { const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); struct dccp_hdr *dh; @@ -108,8 +110,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) break; } - dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, - inet->daddr); + icsk->icsk_af_ops->send_check(sk, skb->len, skb); if (set_ack) dccp_event_ack_sent(sk); @@ -117,7 +118,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) DCCP_INC_STATS(DCCP_MIB_OUTSEGS); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - err = ip_queue_xmit(skb, 0); + err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (err <= 0) return err; @@ -134,20 +135,13 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) { + struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); - int mss_now; - - /* - * FIXME: we really should be using the af_specific thing to support - * IPv6. - * mss_now = pmtu - tp->af_specific->net_header_len - - * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); - */ - mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) - - sizeof(struct dccp_hdr_ext); + int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - + sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext)); /* Now subtract optional transport overhead */ - mss_now -= dp->dccps_ext_header_len; + mss_now -= icsk->icsk_ext_hdr_len; /* * FIXME: this should come from the CCID infrastructure, where, say, @@ -160,12 +154,14 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; /* And store cached results */ - dp->dccps_pmtu_cookie = pmtu; + icsk->icsk_pmtu_cookie = pmtu; dp->dccps_mss_cache = mss_now; return mss_now; } +EXPORT_SYMBOL_GPL(dccp_sync_mss); + void dccp_write_space(struct sock *sk) { read_lock(&sk->sk_callback_lock); @@ -266,7 +262,7 @@ int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo) int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { - if (inet_sk_rebuild_header(sk) != 0) + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0) return -EHOSTUNREACH; /* Routing failure or similar. */ return dccp_transmit_skb(sk, (skb_cloned(skb) ? @@ -321,6 +317,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, return skb; } +EXPORT_SYMBOL_GPL(dccp_make_response); + struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, const enum dccp_reset_codes code) @@ -377,6 +375,7 @@ struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, */ static inline void dccp_connect_init(struct sock *sk) { + struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -385,10 +384,16 @@ static inline void dccp_connect_init(struct sock *sk) dccp_sync_mss(sk, dst_mtu(dst)); - /* - * FIXME: set dp->{dccps_swh,dccps_swl}, with - * something like dccp_inc_seq - */ + dccp_update_gss(sk, dp->dccps_iss); + /* + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + */ + dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss)); icsk->icsk_retransmits = 0; } @@ -420,6 +425,8 @@ int dccp_connect(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(dccp_connect); + void dccp_send_ack(struct sock *sk) { /* If we have been reset, we may not send again. */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 8a6b2a9e4581..65b11ea90d85 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -24,7 +24,7 @@ #include <net/checksum.h> #include <net/inet_common.h> -#include <net/ip.h> +#include <net/inet_sock.h> #include <net/protocol.h> #include <net/sock.h> #include <net/xfrm.h> @@ -34,15 +34,18 @@ #include <linux/timer.h> #include <linux/delay.h> #include <linux/poll.h> -#include <linux/dccp.h> #include "ccid.h" #include "dccp.h" DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; +EXPORT_SYMBOL_GPL(dccp_statistics); + atomic_t dccp_orphan_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(dccp_orphan_count); + static struct net_protocol dccp_protocol = { .handler = dccp_v4_rcv, .err_handler = dccp_v4_err, @@ -149,6 +152,8 @@ int dccp_disconnect(struct sock *sk, int flags) return err; } +EXPORT_SYMBOL_GPL(dccp_disconnect); + /* * Wait for a DCCP event. * @@ -156,8 +161,8 @@ int dccp_disconnect(struct sock *sk, int flags) * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ -static unsigned int dccp_poll(struct file *file, struct socket *sock, - poll_table *wait) +unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait) { unsigned int mask; struct sock *sk = sock->sk; @@ -205,12 +210,16 @@ static unsigned int dccp_poll(struct file *file, struct socket *sock, return mask; } +EXPORT_SYMBOL_GPL(dccp_poll); + int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) { dccp_pr_debug("entry\n"); return -ENOIOCTLCMD; } +EXPORT_SYMBOL_GPL(dccp_ioctl); + static int dccp_setsockopt_service(struct sock *sk, const u32 service, char __user *optval, int optlen) { @@ -254,7 +263,9 @@ int dccp_setsockopt(struct sock *sk, int level, int optname, int val; if (level != SOL_DCCP) - return ip_setsockopt(sk, level, optname, optval, optlen); + return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, + optname, optval, + optlen); if (optlen < sizeof(int)) return -EINVAL; @@ -282,6 +293,8 @@ int dccp_setsockopt(struct sock *sk, int level, int optname, return err; } +EXPORT_SYMBOL_GPL(dccp_setsockopt); + static int dccp_getsockopt_service(struct sock *sk, int len, u32 __user *optval, int __user *optlen) @@ -320,8 +333,9 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, int val, len; if (level != SOL_DCCP) - return ip_getsockopt(sk, level, optname, optval, optlen); - + return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, + optname, optval, + optlen); if (get_user(len, optlen)) return -EFAULT; @@ -354,6 +368,8 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, return 0; } +EXPORT_SYMBOL_GPL(dccp_getsockopt); + int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -410,6 +426,8 @@ out_discard: goto out_release; } +EXPORT_SYMBOL_GPL(dccp_sendmsg); + int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -507,7 +525,9 @@ out: return len; } -static int inet_dccp_listen(struct socket *sock, int backlog) +EXPORT_SYMBOL_GPL(dccp_recvmsg); + +int inet_dccp_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; @@ -543,6 +563,8 @@ out: return err; } +EXPORT_SYMBOL_GPL(inet_dccp_listen); + static const unsigned char dccp_new_state[] = { /* current state: new state: action: */ [0] = DCCP_CLOSED, @@ -648,12 +670,16 @@ adjudge_to_death: sock_put(sk); } +EXPORT_SYMBOL_GPL(dccp_close); + void dccp_shutdown(struct sock *sk, int how) { dccp_pr_debug("entry\n"); } -static struct proto_ops inet_dccp_ops = { +EXPORT_SYMBOL_GPL(dccp_shutdown); + +static const struct proto_ops inet_dccp_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -681,11 +707,11 @@ extern struct net_proto_family inet_family_ops; static struct inet_protosw dccp_v4_protosw = { .type = SOCK_DCCP, .protocol = IPPROTO_DCCP, - .prot = &dccp_v4_prot, + .prot = &dccp_prot, .ops = &inet_dccp_ops, .capability = -1, .no_check = 0, - .flags = 0, + .flags = INET_PROTOSW_ICSK, }; /* @@ -760,13 +786,15 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); int dccp_debug; module_param(dccp_debug, int, 0444); MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); + +EXPORT_SYMBOL_GPL(dccp_debug); #endif static int __init dccp_init(void) { unsigned long goal; int ehash_order, bhash_order, i; - int rc = proto_register(&dccp_v4_prot, 1); + int rc = proto_register(&dccp_prot, 1); if (rc) goto out; @@ -869,7 +897,7 @@ out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_hashinfo.bind_bucket_cachep = NULL; out_proto_unregister: - proto_unregister(&dccp_v4_prot); + proto_unregister(&dccp_prot); goto out; } @@ -892,7 +920,7 @@ static void __exit dccp_fini(void) get_order(dccp_hashinfo.ehash_size * sizeof(struct inet_ehash_bucket))); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); - proto_unregister(&dccp_v4_prot); + proto_unregister(&dccp_prot); } module_init(dccp_init); diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index d402e9020c68..78ec5344be86 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -149,7 +149,7 @@ static void dn_keepalive(struct sock *sk); #define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1) -static struct proto_ops dn_proto_ops; +static const struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; @@ -1252,7 +1252,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; default: - err = dev_ioctl(cmd, (void __user *)arg); + err = -ENOIOCTLCMD; break; } @@ -2342,7 +2342,7 @@ static struct net_proto_family dn_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops dn_proto_ops = { +static const struct proto_ops dn_proto_ops = { .family = AF_DECnet, .owner = THIS_MODULE, .release = dn_release, diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 8d0cc3cf3e49..33ab256cfd4a 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -408,11 +408,14 @@ int dn_neigh_router_hello(struct sk_buff *skb) } } - if (!dn_db->router) { - dn_db->router = neigh_clone(neigh); - } else { - if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) - neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); + /* Only use routers in our area */ + if ((dn_ntohs(src)>>10) == dn_ntohs((decnet_address)>>10)) { + if (!dn_db->router) { + dn_db->router = neigh_clone(neigh); + } else { + if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) + neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); + } } write_unlock(&neigh->lock); neigh_release(neigh); diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 369f25b60f3f..44bda85e678f 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -793,7 +793,6 @@ static int dn_nsp_rx_packet(struct sk_buff *skb) got_it: if (sk != NULL) { struct dn_scp *scp = DN_SK(sk); - int ret; /* Reset backoff */ scp->nsp_rxtshift = 0; @@ -807,21 +806,7 @@ got_it: goto free_out; } - bh_lock_sock(sk); - ret = NET_RX_SUCCESS; - if (decnet_debug_level & 8) - printk(KERN_DEBUG "NSP: 0x%02x 0x%02x 0x%04x 0x%04x %d\n", - (int)cb->rt_flags, (int)cb->nsp_flags, - (int)cb->src_port, (int)cb->dst_port, - !!sock_owned_by_user(sk)); - if (!sock_owned_by_user(sk)) - ret = dn_nsp_backlog_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - sock_put(sk); - - return ret; + return sk_receive_skb(sk, skb); } return dn_nsp_no_socket(skb, reason); diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 34fdac51df96..c792994d7952 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -31,6 +31,7 @@ #include <linux/if_arp.h> #include <linux/wireless.h> #include <linux/skbuff.h> +#include <linux/udp.h> #include <net/sock.h> #include <net/inet_common.h> #include <linux/stat.h> @@ -45,7 +46,7 @@ #include <asm/uaccess.h> #include <asm/system.h> -static struct proto_ops econet_ops; +static const struct proto_ops econet_ops; static struct hlist_head econet_sklist; static DEFINE_RWLOCK(econet_lock); @@ -56,7 +57,7 @@ static struct net_device *net2dev_map[256]; #define EC_PORT_IP 0xd2 #ifdef CONFIG_ECONET_AUNUDP -static spinlock_t aun_queue_lock; +static DEFINE_SPINLOCK(aun_queue_lock); static struct socket *udpsock; #define AUN_PORT 0x8000 @@ -686,7 +687,7 @@ static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg break; default: - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } /*NOTREACHED*/ return 0; @@ -698,7 +699,7 @@ static struct net_proto_family econet_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { .family = PF_ECONET, .owner = THIS_MODULE, .release = econet_release, diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c index 03efaacbdb73..4cc6f41c6930 100644 --- a/net/ieee80211/ieee80211_rx.c +++ b/net/ieee80211/ieee80211_rx.c @@ -410,9 +410,8 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, return 1; } - if ((is_multicast_ether_addr(hdr->addr1) || - is_broadcast_ether_addr(hdr->addr2)) ? ieee->host_mc_decrypt : - ieee->host_decrypt) { + if (is_multicast_ether_addr(hdr->addr1) + ? ieee->host_mc_decrypt : ieee->host_decrypt) { int idx = 0; if (skb->len >= hdrlen + 3) idx = skb->data[hdrlen + 3] >> 6; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e55136ae09f4..011cca7ae02b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -456,6 +456,14 @@ config TCP_CONG_BIC increase provides TCP friendliness. See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ +config TCP_CONG_CUBIC + tristate "CUBIC TCP" + default m + ---help--- + This is version 2.0 of BIC-TCP which uses a cubic growth function + among other techniques. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + config TCP_CONG_WESTWOOD tristate "TCP Westwood+" default m diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f0435d00db6b..c54edd76de09 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d368cf249000..966a071a408c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -93,6 +93,7 @@ #include <linux/smp_lock.h> #include <linux/inet.h> #include <linux/igmp.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> @@ -302,6 +303,7 @@ lookup_protocol: sk->sk_reuse = 1; inet = inet_sk(sk); + inet->is_icsk = INET_PROTOSW_ICSK & answer_flags; if (SOCK_RAW == sock->type) { inet->num = protocol; @@ -775,16 +777,16 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = devinet_ioctl(cmd, (void __user *)arg); break; default: - if (!sk->sk_prot->ioctl || - (err = sk->sk_prot->ioctl(sk, cmd, arg)) == - -ENOIOCTLCMD) - err = dev_ioctl(cmd, (void __user *)arg); + if (sk->sk_prot->ioctl) + err = sk->sk_prot->ioctl(sk, cmd, arg); + else + err = -ENOIOCTLCMD; break; } return err; } -struct proto_ops inet_stream_ops = { +const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -805,7 +807,7 @@ struct proto_ops inet_stream_ops = { .sendpage = tcp_sendpage }; -struct proto_ops inet_dgram_ops = { +const struct proto_ops inet_dgram_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -830,7 +832,7 @@ struct proto_ops inet_dgram_ops = { * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without * udp_poll */ -static struct proto_ops inet_sockraw_ops = { +static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -869,7 +871,8 @@ static struct inet_protosw inetsw_array[] = .ops = &inet_stream_ops, .capability = -1, .no_check = 0, - .flags = INET_PROTOSW_PERMANENT, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, }, { diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 035ad2c9e1ba..aed537fa2c88 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -6,6 +6,7 @@ #include <linux/crypto.h> #include <linux/pfkeyv2.h> #include <net/icmp.h> +#include <net/protocol.h> #include <asm/scatterlist.h> diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index b425748f02d7..37432088fe6d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -86,6 +86,7 @@ #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/fddidevice.h> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 04a6fe3e95a2..7b9bb28e2ee9 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -58,6 +58,7 @@ #endif #include <linux/kmod.h> +#include <net/arp.h> #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 1b18ce66e7b7..73bfcae8af9c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -9,6 +9,7 @@ #include <linux/pfkeyv2.h> #include <linux/random.h> #include <net/icmp.h> +#include <net/protocol.h> #include <net/udp.h> /* decapsulation data for use when post-processing */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 19b1b984d687..18f5e509281a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -30,6 +30,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 7ea0209cb169..e2890ec8159e 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -29,6 +29,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 0b298bbc1518..0dd4d06e456d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -33,6 +33,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6d2a6ac070e3..ef4724de7350 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -29,6 +29,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> @@ -36,6 +37,7 @@ #include <linux/netlink.h> #include <linux/init.h> +#include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 705e3ce86df9..e320b32373e5 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -41,6 +41,13 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * Substantial contributions to this work comes from: + * + * David S. Miller, <davem@davemloft.net> + * Stephen Hemminger <shemminger@osdl.org> + * Paul E. McKenney <paulmck@us.ibm.com> + * Patrick McHardy <kaber@trash.net> */ #define VERSION "0.404" @@ -59,6 +66,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 92e23b2ad4d2..be5a519cd2f8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -73,6 +73,7 @@ #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/string.h> #include <linux/netfilter_ipv4.h> diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4a195c724f01..34758118c10c 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -91,6 +91,8 @@ #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> + +#include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3fe021f1a566..ae20281d8deb 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); */ int sysctl_local_port_range[2] = { 1024, 4999 }; -static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) +int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) { const u32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; @@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke return node != NULL; } +EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); + /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. */ int inet_csk_get_port(struct inet_hashinfo *hashinfo, - struct sock *sk, unsigned short snum) + struct sock *sk, unsigned short snum, + int (*bind_conflict)(const struct sock *sk, + const struct inet_bind_bucket *tb)) { struct inet_bind_hashbucket *head; struct hlist_node *node; @@ -125,7 +130,7 @@ tb_found: goto success; } else { ret = 1; - if (inet_csk_bind_conflict(sk, tb)) + if (bind_conflict(sk, tb)) goto fail_unlock; } } @@ -380,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, EXPORT_SYMBOL_GPL(inet_csk_search_req); void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - const unsigned timeout) + unsigned long timeout) { struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; @@ -631,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk) } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); + +void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + const struct inet_sock *inet = inet_sk(sk); + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = inet->daddr; + sin->sin_port = inet->dport; +} + +EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 39061ed53cfd..c49908192047 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, r->idiag_inode = 0; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6tw->tw_v6_rcv_saddr); + &tw6->tw_v6_rcv_saddr); ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6tw->tw_v6_daddr); + &tw6->tw_v6_daddr); } #endif nlh->nlmsg_len = skb->tail - b; @@ -489,9 +489,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6_rsk(req)->loc_addr); + &inet6_rsk(req)->loc_addr); ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6_rsk(req)->rmt_addr); + &inet6_rsk(req)->rmt_addr); } #endif nlh->nlmsg_len = skb->tail - b; @@ -553,13 +553,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.saddr = #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? - tcp6_rsk(req)->loc_addr.s6_addr32 : + inet6_rsk(req)->loc_addr.s6_addr32 : #endif &ireq->loc_addr; entry.daddr = #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? - tcp6_rsk(req)->rmt_addr.s6_addr32 : + inet6_rsk(req)->rmt_addr.s6_addr32 : #endif &ireq->rmt_addr; entry.dport = ntohs(ireq->rmt_port); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e8d29fe736d2..33228115cda4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -15,12 +15,14 @@ #include <linux/config.h> #include <linux/module.h> +#include <linux/random.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> +#include <net/ip.h> /* * Allocate and initialize a new local port bind bucket. @@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad } EXPORT_SYMBOL_GPL(__inet_lookup_listener); + +/* called with local bh disabled */ +static int __inet_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); + u32 daddr = inet->rcv_saddr; + u32 saddr = inet->daddr; + int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + prefetch(head->chain.first); + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { + tw = inet_twsk(sk2); + + if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->num = lport; + inet->sport = htons(lport); + sk->sk_hash = hash; + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw) { + /* Silly. Should hash-dance instead... */ + inet_twsk_deschedule(tw, death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, + inet->dport); +} + +/* + * Bind a port for a connect operation and hash it. + */ +int inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int range = high - low; + int i; + int port; + static u32 hint; + u32 offset = hint + inet_sk_port_offset(sk); + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__inet_check_established(death_row, + sk, port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __inet_hash(hinfo, sk, 0); + } + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, death_row);; + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + __inet_hash(hinfo, sk, 0); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __inet_check_established(death_row, sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +EXPORT_SYMBOL_GPL(inet_hash_connect); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index a010e9a68811..417f126c749e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) { - struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, - SLAB_ATOMIC); + struct inet_timewait_sock *tw = + kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + SLAB_ATOMIC); if (tw != NULL) { const struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 2fc3fd38924f..ce5fe3f74a3d 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create) return NULL; n->v4daddr = daddr; atomic_set(&n->refcnt, 1); + atomic_set(&n->rid, 0); n->ip_id_count = secure_ip_id(daddr); n->tcp_ts_stamp = 0; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8ce0ce2ee48e..ce2b70ce4018 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -22,6 +22,7 @@ * Patrick McHardy : LRU queue of frag heads for evictor. */ +#include <linux/compiler.h> #include <linux/config.h> #include <linux/module.h> #include <linux/types.h> @@ -38,6 +39,7 @@ #include <net/ip.h> #include <net/icmp.h> #include <net/checksum.h> +#include <net/inetpeer.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/inet.h> @@ -56,6 +58,8 @@ int sysctl_ipfrag_high_thresh = 256*1024; int sysctl_ipfrag_low_thresh = 192*1024; +int sysctl_ipfrag_max_dist = 64; + /* Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. */ @@ -89,8 +93,10 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - int iif; struct timeval stamp; + int iif; + unsigned int rid; + struct inet_peer *peer; }; /* Hash table. */ @@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work) BUG_TRAP(qp->last_in&COMPLETE); BUG_TRAP(del_timer(&qp->timer) == 0); + if (qp->peer) + inet_putpeer(qp->peer); + /* Release all fragment data. */ fp = qp->fragments; while (fp) { @@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) qp->meat = 0; qp->fragments = NULL; qp->iif = 0; + qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL; /* Initialize a timer for this entry. */ init_timer(&qp->timer); @@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) return ip_frag_create(hash, iph, user); } +/* Is the fragment too far ahead to be part of ipq? */ +static inline int ip_frag_too_far(struct ipq *qp) +{ + struct inet_peer *peer = qp->peer; + unsigned int max = sysctl_ipfrag_max_dist; + unsigned int start, end; + + int rc; + + if (!peer || !max) + return 0; + + start = qp->rid; + end = atomic_inc_return(&peer->rid); + qp->rid = end; + + rc = qp->fragments && (end - start) > max; + + if (rc) { + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } + + return rc; +} + +static int ip_frag_reinit(struct ipq *qp) +{ + struct sk_buff *fp; + + if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) { + atomic_inc(&qp->refcnt); + return -ETIMEDOUT; + } + + fp = qp->fragments; + do { + struct sk_buff *xp = fp->next; + frag_kfree_skb(fp, NULL); + fp = xp; + } while (fp); + + qp->last_in = 0; + qp->len = 0; + qp->meat = 0; + qp->fragments = NULL; + qp->iif = 0; + + return 0; +} + /* Add new segment to existing queue. */ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { @@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (qp->last_in & COMPLETE) goto err; + if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && + unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) { + ipq_kill(qp); + goto err; + } + offset = ntohs(skb->nh.iph->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 473d0f2b2e0d..e45846ae570b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -128,6 +128,7 @@ #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index dbe12da8d8b3..d3f6c468faf4 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -22,6 +22,7 @@ #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> +#include <net/route.h> /* * Write options to IP header, record destination address to diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index eba64e2bd397..2a830de3a699 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) hlen = iph->ihl * 4; mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 4f2d87257309..6986e11d65cc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -25,12 +25,12 @@ #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> -#include <net/tcp.h> -#include <linux/tcp.h> +#include <net/tcp_states.h> #include <linux/udp.h> #include <linux/igmp.h> #include <linux/netfilter.h> @@ -427,8 +427,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, err = ip_options_get_from_user(&opt, optval, optlen); if (err) break; - if (sk->sk_type == SOCK_STREAM) { - struct tcp_sock *tp = tcp_sk(sk); + if (inet->is_icsk) { + struct inet_connection_sock *icsk = inet_csk(sk); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (sk->sk_family == PF_INET || (!((1 << sk->sk_state) & @@ -436,10 +436,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, inet->daddr != LOOPBACK4_IPV6)) { #endif if (inet->opt) - tp->ext_header_len -= inet->opt->optlen; + icsk->icsk_ext_hdr_len -= inet->opt->optlen; if (opt) - tp->ext_header_len += opt->optlen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len += opt->optlen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) } #endif diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index fc718df17b40..d64e2ec8da7b 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -28,6 +28,7 @@ #include <net/xfrm.h> #include <net/icmp.h> #include <net/ipcomp.h> +#include <net/protocol.h> struct ipcomp_tfms { struct list_head list; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index e8674baaa8d9..bb3613ec448c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -42,6 +42,7 @@ #include <linux/in.h> #include <linux/if.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> @@ -58,6 +59,7 @@ #include <net/arp.h> #include <net/ip.h> #include <net/ipconfig.h> +#include <net/route.h> #include <asm/uaccess.h> #include <net/checksum.h> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 302b7eb507c9..caa3b7d2e48a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -52,6 +52,7 @@ #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> +#include <net/route.h> #include <net/sock.h> #include <net/icmp.h> #include <net/udp.h> diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index d7eb680101c2..9b176a942ac5 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -224,34 +224,6 @@ void unregister_ip_vs_app(struct ip_vs_app *app) } -#if 0000 -/* - * Get reference to app by name (called from user context) - */ -struct ip_vs_app *ip_vs_app_get_by_name(char *appname) -{ - struct ip_vs_app *app, *a = NULL; - - down(&__ip_vs_app_mutex); - - list_for_each_entry(ent, &ip_vs_app_list, a_list) { - if (strcmp(app->name, appname)) - continue; - - /* softirq may call ip_vs_app_get too, so the caller - must disable softirq on the current CPU */ - if (ip_vs_app_get(app)) - a = app; - break; - } - - up(&__ip_vs_app_mutex); - - return a; -} -#endif - - /* * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) */ diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 2a3a8c59c655..81d90354c928 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -24,7 +24,10 @@ * */ +#include <linux/in.h> +#include <linux/net.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/vmalloc.h> #include <linux/proc_fs.h> /* for proc_net_* */ #include <linux/seq_file.h> @@ -219,7 +222,7 @@ struct ip_vs_conn *ip_vs_conn_in_get if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); - IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -254,7 +257,7 @@ struct ip_vs_conn *ip_vs_ct_in_get out: ct_read_unlock(hash); - IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -295,7 +298,7 @@ struct ip_vs_conn *ip_vs_conn_out_get ct_read_unlock(hash); - IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -391,8 +394,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) cp->flags |= atomic_read(&dest->conn_flags); cp->dest = dest; - IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), @@ -430,8 +434,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) if (!dest) return; - IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), @@ -571,7 +576,7 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_conn_hash(cp); expire_later: - IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", + IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", atomic_read(&cp->refcnt)-1, atomic_read(&cp->n_control)); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 1a0843cd58a9..1aca94a9fd8b 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -426,7 +426,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) return NULL; IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " - "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n", + "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 9bdcf31b760e..c935c5086d33 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -35,6 +35,7 @@ #include <linux/netfilter_ipv4.h> #include <net/ip.h> +#include <net/route.h> #include <net/sock.h> #include <asm/uaccess.h> @@ -447,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport) out: read_unlock(&__ip_vs_svc_lock); - IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", + IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", fwmark, ip_vs_proto_name(protocol), NIPQUAD(vaddr), ntohs(vport), svc?"hit":"not hit"); @@ -597,7 +598,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) */ list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " - "refcnt=%d\n", + "dest->refcnt=%d\n", dest->vfwmark, NIPQUAD(dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); @@ -804,7 +805,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) dest = ip_vs_trash_get_dest(svc, daddr, dport); if (dest != NULL) { IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " - "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", + "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", NIPQUAD(daddr), ntohs(dport), atomic_read(&dest->refcnt), dest->vfwmark, @@ -949,7 +950,8 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) atomic_dec(&dest->svc->refcnt); kfree(dest); } else { - IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n", + IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " + "dest->refcnt=%d\n", NIPQUAD(dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); list_add(&dest->n_list, &ip_vs_dest_trash); diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index f3bc320dce93..9fee19c4c617 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -37,8 +37,10 @@ * */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 67b3e2fc1fa1..e7004741ac73 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -13,7 +13,10 @@ * Changes: * */ +#include <linux/config.h> #include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/slab.h> #include <linux/types.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 561cda326fa8..6e5cb92a5c83 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -41,8 +41,10 @@ * me to write this module. */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> /* for sysctl */ #include <linux/fs.h> @@ -228,33 +230,6 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) } -#if 0000 -/* - * Unhash ip_vs_lblc_entry from ip_vs_lblc_table. - * returns bool success. - */ -static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl, - struct ip_vs_lblc_entry *en) -{ - if (list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Remove it from the table - */ - write_lock(&tbl->lock); - list_del(&en->list); - INIT_LIST_HEAD(&en->list); - write_unlock(&tbl->lock); - - return 1; -} -#endif - - /* * Get ip_vs_lblc_entry associated with supplied parameters. */ diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index ce456dbf09a5..32ba37ba72d8 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -39,8 +39,10 @@ * */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> /* for sysctl */ #include <linux/fs.h> @@ -414,33 +416,6 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) } -#if 0000 -/* - * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table. - * returns bool success. - */ -static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl, - struct ip_vs_lblcr_entry *en) -{ - if (list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Remove it from the table - */ - write_lock(&tbl->lock); - list_del(&en->list); - INIT_LIST_HEAD(&en->list); - write_unlock(&tbl->lock); - - return 1; -} -#endif - - /* * Get ip_vs_lblcr_entry associated with supplied parameters. */ diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 453e94a0bbd7..8b0505b09317 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -12,6 +12,8 @@ * */ +#include <linux/in.h> +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index 478e5c7c7e8e..c36ccf057a19 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -12,6 +12,8 @@ * */ +#include <linux/in.h> +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 0e878fd6215c..bc28b1160a3a 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -275,28 +275,6 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_LAST] = 2*HZ, }; - -#if 0 - -/* FIXME: This is going to die */ - -static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = 2*HZ, - [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ, - [IP_VS_TCP_S_SYN_SENT] = 60*HZ, - [IP_VS_TCP_S_SYN_RECV] = 10*HZ, - [IP_VS_TCP_S_FIN_WAIT] = 60*HZ, - [IP_VS_TCP_S_TIME_WAIT] = 60*HZ, - [IP_VS_TCP_S_CLOSE] = 10*HZ, - [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, - [IP_VS_TCP_S_LAST_ACK] = 30*HZ, - [IP_VS_TCP_S_LISTEN] = 2*60*HZ, - [IP_VS_TCP_S_SYNACK] = 100*HZ, - [IP_VS_TCP_S_LAST] = 2*HZ, -}; - -#endif - static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = "NONE", [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", @@ -448,7 +426,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_dest *dest = cp->dest; IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" - "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n", + "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n", pp->name, (state_off==TCP_DIR_OUTPUT)?"output ":"input ", th->syn? 'S' : '.', diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 8ae5f2e0aefa..89d9175d8f28 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -15,8 +15,11 @@ * */ +#include <linux/in.h> +#include <linux/ip.h> #include <linux/kernel.h> #include <linux/netfilter_ipv4.h> +#include <linux/udp.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 6f7c50e44a39..7775e6cc68be 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -34,8 +34,10 @@ * */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 2e5ced3d8062..1bca714bda3d 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -21,12 +21,14 @@ #include <linux/module.h> #include <linux/slab.h> +#include <linux/inetdevice.h> #include <linux/net.h> #include <linux/completion.h> #include <linux/delay.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/igmp.h> /* for ip_mc_join_group */ +#include <linux/udp.h> #include <net/ip.h> #include <net/sock.h> diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3c2e9639bba6..bba156304695 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -68,19 +68,14 @@ struct arpt_table_info { unsigned int initial_entries; unsigned int hook_entry[NF_ARP_NUMHOOKS]; unsigned int underflow[NF_ARP_NUMHOOKS]; - char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); + void *entries[NR_CPUS]; }; static LIST_HEAD(arpt_target); static LIST_HEAD(arpt_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, char *hdr_addr, int len) { @@ -269,9 +264,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb, outdev = out ? out->name : nulldevname; read_lock_bh(&table->lock); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, - smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); back = get_entry(table_base, table->private->underflow[hook]); @@ -462,7 +455,8 @@ static inline int unconditional(const struct arpt_arp *arp) /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ -static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) +static int mark_source_chains(struct arpt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -472,7 +466,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct arpt_entry *e - = (struct arpt_entry *)(newinfo->entries + pos); + = (struct arpt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -514,13 +508,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali goto next; e = (struct arpt_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct arpt_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -537,7 +531,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali newpos = pos + e->next_offset; } e = (struct arpt_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -689,6 +683,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) static int translate_table(const char *name, unsigned int valid_hooks, struct arpt_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -710,11 +705,11 @@ static int translate_table(const char *name, i = 0; /* Walk through entries, checking offsets. */ - ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) @@ -743,29 +738,26 @@ static int translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) { + if (!mark_source_chains(newinfo, valid_hooks, entry0)) { duprintf("Looping hook\n"); return -ELOOP; } /* Finally, each sanity check must pass */ i = 0; - ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ARPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -807,15 +799,42 @@ static inline int add_entry_to_counter(const struct arpt_entry *e, return 0; } +static inline int set_entry_to_counter(const struct arpt_entry *e, + struct arpt_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct arpt_table_info *t, struct arpt_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + ARPT_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + ARPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -831,6 +850,7 @@ static int copy_entries_to_user(unsigned int total_size, struct arpt_entry *e; struct arpt_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -843,13 +863,13 @@ static int copy_entries_to_user(unsigned int total_size, return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + /* ... then copy entire thing ... */ + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -859,7 +879,7 @@ static int copy_entries_to_user(unsigned int total_size, for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ struct arpt_entry_target *t; - e = (struct arpt_entry *)(table->private->entries + off); + e = (struct arpt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct arpt_entry, counters), &counters[num], @@ -911,6 +931,47 @@ static int get_entries(const struct arpt_get_entries *entries, return ret; } +static void free_table_info(struct arpt_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct arpt_table_info *alloc_table_info(unsigned int size) +{ + struct arpt_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + + if (newinfo->entries[cpu] == NULL) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { int ret; @@ -918,6 +979,7 @@ static int do_replace(void __user *user, unsigned int len) struct arpt_table *t; struct arpt_table_info *newinfo, *oldinfo; struct arpt_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -930,13 +992,13 @@ static int do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -947,10 +1009,9 @@ static int do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -989,8 +1050,10 @@ static int do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct arpt_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1002,11 +1065,11 @@ static int do_replace(void __user *user, unsigned int len) module_put(t->me); up(&arpt_mutex); free_newinfo_counters_untrans: - ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); + ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1030,6 +1093,7 @@ static int do_add_counters(void __user *user, unsigned int len) struct arpt_counters_info tmp, *paddc; struct arpt_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1059,7 +1123,9 @@ static int do_add_counters(void __user *user, unsigned int len) } i = 0; - ARPT_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1220,30 +1286,32 @@ int arpt_register_table(struct arpt_table *table, struct arpt_table_info *newinfo; static struct arpt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) { ret = -ENOMEM; return ret; } - memcpy(newinfo->entries, repl->entries, repl->size); + + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&arpt_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1272,20 +1340,23 @@ int arpt_register_table(struct arpt_table *table, return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void arpt_unregister_table(struct arpt_table *table) { + void *loc_cpu_entry; + down(&arpt_mutex); LIST_DELETE(&arpt_tables, table); up(&arpt_mutex); /* Decrease module usage counts and free resources */ - ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index e52847fa10f5..0366eedb4d70 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -18,11 +18,13 @@ * */ +#include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/ip.h> #include <linux/moduleparam.h> +#include <linux/udp.h> #include <net/checksum.h> #include <net/udp.h> diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 744abb9d377a..57956dee60c8 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -31,6 +31,7 @@ #include <linux/ip.h> #include <linux/in.h> #include <linux/list.h> +#include <linux/seq_file.h> static DEFINE_RWLOCK(ip_ct_gre_lock); #define ASSERT_READ_LOCK(x) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index f2dcac7c7660..46becbe4fe58 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -11,6 +11,7 @@ #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> +#include <linux/ip.h> #include <linux/udp.h> #include <linux/seq_file.h> #include <net/checksum.h> diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index dd476b191f4b..a88bcc551244 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -27,6 +27,7 @@ #endif #include <net/checksum.h> #include <net/ip.h> +#include <net/route.h> #define ASSERT_READ_LOCK(x) #define ASSERT_WRITE_LOCK(x) diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 8acb7ed40b47..4f95d477805c 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -44,6 +44,7 @@ * */ #include <linux/config.h> +#include <linux/in.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -53,6 +54,7 @@ #include <linux/netfilter_ipv4/ip_conntrack_helper.h> #include <linux/netfilter_ipv4/ip_nat_helper.h> #include <linux/ip.h> +#include <linux/udp.h> #include <net/checksum.h> #include <net/udp.h> #include <asm/uaccess.h> diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 45886c8475e8..2a26d167e149 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex); context stops packets coming through and allows user context to read the counters or update the rules. - To be cache friendly on SMP, we arrange them like so: - [ n-entries ] - ... cache-align padding ... - [ n-entries ] - Hence the start of any table is given by get_table() below. */ /* The table itself */ @@ -105,20 +100,15 @@ struct ipt_table_info unsigned int underflow[NF_IP_NUMHOOKS]; /* ipt_entry tables: one per CPU */ - char entries[0] ____cacheline_aligned; + void *entries[NR_CPUS]; }; static LIST_HEAD(ipt_target); static LIST_HEAD(ipt_match); static LIST_HEAD(ipt_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - #if 0 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) @@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip) /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) +mark_source_chains(struct ipt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ipt_entry *e - = (struct ipt_entry *)(newinfo->entries + pos); + = (struct ipt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) goto next; e = (struct ipt_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct ipt_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) newpos = pos + e->next_offset; } e = (struct ipt_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -855,6 +845,7 @@ static int translate_table(const char *name, unsigned int valid_hooks, struct ipt_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -875,11 +866,11 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); if (ret != 0) return ret; @@ -907,27 +898,24 @@ translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) + if (!mark_source_chains(newinfo, valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + IPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -943,15 +931,12 @@ replace_table(struct ipt_table *table, #ifdef CONFIG_NETFILTER_DEBUG { - struct ipt_entry *table_base; - unsigned int i; + int cpu; - for_each_cpu(i) { - table_base = - (void *)newinfo->entries - + TABLE_OFFSET(newinfo, i); - - table_base->comefrom = 0xdead57ac; + for_each_cpu(cpu) { + struct ipt_entry *table_base = newinfo->entries[cpu]; + if (table_base) + table_base->comefrom = 0xdead57ac; } } #endif @@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e, return 0; } +static inline int +set_entry_to_counter(const struct ipt_entry *e, + struct ipt_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct ipt_table_info *t, struct ipt_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + IPT_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + IPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size, struct ipt_entry *e; struct ipt_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct ipt_counters) * table->private->number; - counters = vmalloc(countersize); + counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + /* choose the copy that is on our node/cpu, ... + * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + /* ... then copy entire thing ... */ + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size, struct ipt_entry_match *m; struct ipt_entry_target *t; - e = (struct ipt_entry *)(table->private->entries + off); + e = (struct ipt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct ipt_entry, counters), &counters[num], @@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries, return ret; } +static void free_table_info(struct ipt_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct ipt_table_info *alloc_table_info(unsigned int size) +{ + struct ipt_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu)); + if (newinfo->entries[cpu] == 0) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { @@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len) struct ipt_table *t; struct ipt_table_info *newinfo, *oldinfo; struct ipt_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct ipt_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len) module_put(t->me); up(&ipt_mutex); free_newinfo_counters_untrans: - IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len) struct ipt_counters_info tmp, *paddc; struct ipt_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len) if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) return -EINVAL; - paddc = vmalloc(len); + paddc = vmalloc_node(len, numa_node_id()); if (!paddc) return -ENOMEM; @@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len) } i = 0; - IPT_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) struct ipt_table_info *newinfo; static struct ipt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; - memcpy(newinfo->entries, repl->entries, repl->size); + /* choose the copy on our node/cpu + * but dont care of preemption + */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&ipt_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void ipt_unregister_table(struct ipt_table *table) { + void *loc_cpu_entry; + down(&ipt_mutex); LIST_DELETE(&ipt_tables, table); up(&ipt_mutex); /* Decrease module usage counts and free resources */ - IPT_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* Returns 1 if the port is matched by the range, 0 otherwise */ diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 275a174c6fe6..27860510ca6d 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -11,6 +11,7 @@ #include <linux/config.h> #include <linux/types.h> +#include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/timer.h> #include <linux/module.h> @@ -18,6 +19,7 @@ #include <net/protocol.h> #include <net/ip.h> #include <net/checksum.h> +#include <net/route.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv4/ip_nat_rule.h> #include <linux/netfilter_ipv4/ip_tables.h> diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c index 1a53924041fc..03f554857a4d 100644 --- a/net/ipv4/netfilter/ipt_physdev.c +++ b/net/ipv4/netfilter/ipt_physdev.c @@ -9,6 +9,7 @@ */ #include <linux/module.h> +#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/netfilter_ipv4/ipt_physdev.h> #include <linux/netfilter_ipv4/ip_tables.h> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 0d7dc668db46..39d49dc333a7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -38,6 +38,7 @@ #include <net/protocol.h> #include <net/tcp.h> #include <net/udp.h> +#include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/sock.h> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index a34e60ea48a1..e20be3331f67 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; - child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); if (child) inet_csk_reqsk_queue_add(sk, req, child); else diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 01444a02b48b..16984d4a8a06 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -12,6 +12,7 @@ #include <linux/sysctl.h> #include <linux/config.h> #include <linux/igmp.h> +#include <linux/inetdevice.h> #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> @@ -22,6 +23,7 @@ extern int sysctl_ip_nonlocal_bind; #ifdef CONFIG_SYSCTL +static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -614,6 +616,15 @@ ctl_table ipv4_table[] = { .strategy = &sysctl_jiffies }, { + .ctl_name = NET_IPV4_IPFRAG_MAX_DIST, + .procname = "ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero + }, + { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef98b14ac56d..00aa80e93243 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int err = 0; if (level != SOL_TCP) - return tp->af_specific->setsockopt(sk, level, optname, - optval, optlen); + return icsk->icsk_af_ops->setsockopt(sk, level, optname, + optval, optlen); /* This is a string value all the others are int's */ if (optname == TCP_CONGESTION) { @@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); - info->tcpi_pmtu = tp->pmtu_cookie; + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; @@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int val, len; if (level != SOL_TCP) - return tp->af_specific->getsockopt(sk, level, optname, - optval, optlen); + return icsk->icsk_af_ops->getsockopt(sk, level, optname, + optval, optlen); if (get_user(len, optlen)) return -EFAULT; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 1d0cd86621b1..035f2092d73a 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -30,8 +30,6 @@ static int fast_convergence = 1; static int max_increment = 16; static int low_window = 14; static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ -static int low_utilization_threshold = 153; -static int low_utilization_period = 2; static int initial_ssthresh = 100; static int smooth_part = 20; @@ -43,10 +41,6 @@ module_param(low_window, int, 0644); MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "beta for multiplicative increase"); -module_param(low_utilization_threshold, int, 0644); -MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); -module_param(low_utilization_period, int, 0644); -MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); module_param(initial_ssthresh, int, 0644); MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); module_param(smooth_part, int, 0644); @@ -60,11 +54,6 @@ struct bictcp { u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ - u32 delay_min; /* min delay */ - u32 delay_max; /* max delay */ - u32 last_delay; - u8 low_utilization;/* 0: high; 1: low */ - u32 low_utilization_start; /* starting time of low utilization detection*/ u32 epoch_start; /* beginning of an epoch */ #define ACK_RATIO_SHIFT 4 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ @@ -77,11 +66,6 @@ static inline void bictcp_reset(struct bictcp *ca) ca->loss_cwnd = 0; ca->last_cwnd = 0; ca->last_time = 0; - ca->delay_min = 0; - ca->delay_max = 0; - ca->last_delay = 0; - ca->low_utilization = 0; - ca->low_utilization_start = 0; ca->epoch_start = 0; ca->delayed_ack = 2 << ACK_RATIO_SHIFT; } @@ -143,8 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) } /* if in slow start or link utilization is very low */ - if ( ca->loss_cwnd == 0 || - (cwnd > ca->loss_cwnd && ca->low_utilization)) { + if (ca->loss_cwnd == 0) { if (ca->cnt > 20) /* increase cwnd 5% per RTT */ ca->cnt = 20; } @@ -154,69 +137,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } - -/* Detect low utilization in congestion avoidance */ -static inline void bictcp_low_utilization(struct sock *sk, int flag) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); - u32 dist, delay; - - /* No time stamp */ - if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || - /* Discard delay samples right after fast recovery */ - tcp_time_stamp < ca->epoch_start + HZ || - /* this delay samples may not be accurate */ - flag == 0) { - ca->last_delay = 0; - goto notlow; - } - - delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ - ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - if (delay == 0) /* no previous delay sample */ - goto notlow; - - /* first time call or link delay decreases */ - if (ca->delay_min == 0 || ca->delay_min > delay) { - ca->delay_min = ca->delay_max = delay; - goto notlow; - } - - if (ca->delay_max < delay) - ca->delay_max = delay; - - /* utilization is low, if avg delay < dist*threshold - for checking_period time */ - dist = ca->delay_max - ca->delay_min; - if (dist <= ca->delay_min>>6 || - tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) - goto notlow; - - if (ca->low_utilization_start == 0) { - ca->low_utilization = 0; - ca->low_utilization_start = tcp_time_stamp; - } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) - > low_utilization_period*HZ) { - ca->low_utilization = 1; - } - - return; - - notlow: - ca->low_utilization = 0; - ca->low_utilization_start = 0; - -} - static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - bictcp_low_utilization(sk, data_acked); - if (!tcp_is_cwnd_limited(sk, in_flight)) return; @@ -249,11 +175,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) ca->epoch_start = 0; /* end of epoch */ - /* in case of wrong delay_max*/ - if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) - ca->delay_max = ca->delay_min - + ((ca->delay_max - ca->delay_min)* 90) / 100; - /* Wmax and fast convergence */ if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) @@ -289,14 +210,14 @@ static void bictcp_state(struct sock *sk, u8 new_state) bictcp_reset(inet_csk_ca(sk)); } -/* Track delayed acknowledgement ratio using sliding window +/* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ static void bictcp_acked(struct sock *sk, u32 cnt) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index c7cc62c8dc12..e688c687d62d 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } + +/* + * Linear increase during slow start + */ +void tcp_slow_start(struct tcp_sock *tp) +{ + if (sysctl_tcp_abc) { + /* RFC3465: Slow Start + * TCP sender SHOULD increase cwnd by the number of + * previously unacknowledged bytes ACKed by each incoming + * acknowledgment, provided the increase is not more than L + */ + if (tp->bytes_acked < tp->mss_cache) + return; + + /* We MAY increase by 2 if discovered delayed ack */ + if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } + tp->bytes_acked = 0; + + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; +} +EXPORT_SYMBOL_GPL(tcp_slow_start); + /* * TCP Reno congestion control * This is special case used for fallback as well. diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c new file mode 100644 index 000000000000..31a4986dfbf7 --- /dev/null +++ b/net/ipv4/tcp_cubic.c @@ -0,0 +1,411 @@ +/* + * TCP CUBIC: Binary Increase Congestion control for TCP v2.0 + * + * This is from the implementation of CUBIC TCP in + * Injong Rhee, Lisong Xu. + * "CUBIC: A New TCP-Friendly High-Speed TCP Variant + * in PFLDnet 2005 + * Available from: + * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + * + * Unless CUBIC is enabled and congestion window is large + * this behaves the same as the original Reno. + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <net/tcp.h> +#include <asm/div64.h> + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ +#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ + +static int fast_convergence = 1; +static int max_increment = 16; +static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int initial_ssthresh = 100; +static int bic_scale = 41; +static int tcp_friendliness = 1; + +static u32 cube_rtt_scale; +static u32 beta_scale; +static u64 cube_factor; + +/* Note parameters that are used for precomputing scale factors are read-only */ +module_param(fast_convergence, int, 0644); +MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); +module_param(max_increment, int, 0644); +MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); +module_param(beta, int, 0444); +MODULE_PARM_DESC(beta, "beta for multiplicative increase"); +module_param(initial_ssthresh, int, 0644); +MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); +module_param(bic_scale, int, 0444); +MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); +module_param(tcp_friendliness, int, 0644); +MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); + +#include <asm/div64.h> + +/* BIC TCP Parameters */ +struct bictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; /* time when updated last_cwnd */ + u32 bic_origin_point;/* origin point of bic function */ + u32 bic_K; /* time to origin point from the beginning of the current epoch */ + u32 delay_min; /* min delay */ + u32 epoch_start; /* beginning of an epoch */ + u32 ack_cnt; /* number of acks */ + u32 tcp_cwnd; /* estimated tcp cwnd */ +#define ACK_RATIO_SHIFT 4 + u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->loss_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->bic_origin_point = 0; + ca->bic_K = 0; + ca->delay_min = 0; + ca->epoch_start = 0; + ca->delayed_ack = 2 << ACK_RATIO_SHIFT; + ca->ack_cnt = 0; + ca->tcp_cwnd = 0; +} + +static void bictcp_init(struct sock *sk) +{ + bictcp_reset(inet_csk_ca(sk)); + if (initial_ssthresh) + tcp_sk(sk)->snd_ssthresh = initial_ssthresh; +} + +/* 64bit divisor, dividend and result. dynamic precision */ +static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) +{ + u_int32_t d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); + + d = divisor >> shift; + dividend >>= shift; + } + + /* avoid 64 bit division if possible */ + if (dividend >> 32) + do_div(dividend, d); + else + dividend = (uint32_t) dividend / d; + + return dividend; +} + +/* + * calculate the cubic root of x using Newton-Raphson + */ +static u32 cubic_root(u64 a) +{ + u32 x, x1; + + /* Initial estimate is based on: + * cbrt(x) = exp(log(x) / 3) + */ + x = 1u << (fls64(a)/3); + + /* + * Iteration based on: + * 2 + * x = ( 2 * x + a / x ) / 3 + * k+1 k k + */ + do { + x1 = x; + x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3; + } while (abs(x1 - x) > 1); + + return x; +} + +/* + * Compute congestion window to use. + */ +static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +{ + u64 offs; + u32 delta, t, bic_target, min_cnt, max_cnt; + + ca->ack_cnt++; /* count the number of ACKs */ + + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + if (ca->epoch_start == 0) { + ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ + ca->ack_cnt = 1; /* start counting */ + ca->tcp_cwnd = cwnd; /* syn with cubic */ + + if (ca->last_max_cwnd <= cwnd) { + ca->bic_K = 0; + ca->bic_origin_point = cwnd; + } else { + /* Compute new K based on + * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) + */ + ca->bic_K = cubic_root(cube_factor + * (ca->last_max_cwnd - cwnd)); + ca->bic_origin_point = ca->last_max_cwnd; + } + } + + /* cubic function - calc*/ + /* calculate c * time^3 / rtt, + * while considering overflow in calculation of time^3 + * (so time^3 is done by using 64 bit) + * and without the support of division of 64bit numbers + * (so all divisions are done by using 32 bit) + * also NOTE the unit of those veriables + * time = (t - K) / 2^bictcp_HZ + * c = bic_scale >> 10 + * rtt = (srtt >> 3) / HZ + * !!! The following code does not have overflow problems, + * if the cwnd < 1 million packets !!! + */ + + /* change the unit from HZ to bictcp_HZ */ + t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start) + << BICTCP_HZ) / HZ; + + if (t < ca->bic_K) /* t - K */ + offs = ca->bic_K - t; + else + offs = t - ca->bic_K; + + /* c/rtt * (t-K)^3 */ + delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); + if (t < ca->bic_K) /* below origin*/ + bic_target = ca->bic_origin_point - delta; + else /* above origin*/ + bic_target = ca->bic_origin_point + delta; + + /* cubic function - calc bictcp_cnt*/ + if (bic_target > cwnd) { + ca->cnt = cwnd / (bic_target - cwnd); + } else { + ca->cnt = 100 * cwnd; /* very small increment*/ + } + + if (ca->delay_min > 0) { + /* max increment = Smax * rtt / 0.1 */ + min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min); + if (ca->cnt < min_cnt) + ca->cnt = min_cnt; + } + + /* slow start and low utilization */ + if (ca->loss_cwnd == 0) /* could be aggressive in slow start */ + ca->cnt = 50; + + /* TCP Friendly */ + if (tcp_friendliness) { + u32 scale = beta_scale; + delta = (cwnd * scale) >> 3; + while (ca->ack_cnt > delta) { /* update tcp cwnd */ + ca->ack_cnt -= delta; + ca->tcp_cwnd++; + } + + if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ + delta = ca->tcp_cwnd - cwnd; + max_cnt = cwnd / delta; + if (ca->cnt > max_cnt) + ca->cnt = max_cnt; + } + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + + +/* Keep track of minimum rtt */ +static inline void measure_delay(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + u32 delay; + + /* No time stamp */ + if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || + /* Discard delay samples right after fast recovery */ + (s32)(tcp_time_stamp - ca->epoch_start) < HZ) + return; + + delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + if (delay == 0) + delay = 1; + + /* first time call or link delay decreases */ + if (ca->delay_min == 0 || ca->delay_min > delay) + ca->delay_min = delay; +} + +static void bictcp_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int data_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + if (data_acked) + measure_delay(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + bictcp_update(ca, tp->snd_cwnd); + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= ca->cnt) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + +} + +static u32 bictcp_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + ca->epoch_start = 0; /* end of epoch */ + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct sock *sk) +{ + struct bictcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); +} + +static u32 bictcp_min_cwnd(struct sock *sk) +{ + return tcp_sk(sk)->snd_ssthresh; +} + +static void bictcp_state(struct sock *sk, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + bictcp_reset(inet_csk_ca(sk)); +} + +/* Track delayed acknowledgment ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct sock *sk, u32 cnt) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + struct bictcp *ca = inet_csk_ca(sk); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += cnt; + } +} + + +static struct tcp_congestion_ops cubictcp = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .min_cwnd = bictcp_min_cwnd, + .pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "cubic", +}; + +static int __init cubictcp_register(void) +{ + BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); + + /* Precompute a bunch of the scaling factors that are used per-packet + * based on SRTT of 100ms + */ + + beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta); + + cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */ + + /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 + * so K = cubic_root( (wmax-cwnd)*rtt/c ) + * the unit of K is bictcp_HZ=2^10, not HZ + * + * c = bic_scale >> 10 + * rtt = 100ms + * + * the following code has been designed and tested for + * cwnd < 1 million packets + * RTT < 100 seconds + * HZ < 1,000,00 (corresponding to 10 nano-second) + */ + + /* 1/c * 2^2*bictcp_HZ * srtt */ + cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */ + + /* divide by bic_scale and by constant Srtt (100ms) */ + do_div(cube_factor, bic_scale * 10); + + return tcp_register_congestion_control(&cubictcp); +} + +static void __exit cubictcp_unregister(void) +{ + tcp_unregister_congestion_control(&cubictcp); +} + +module_init(cubictcp_register); +module_exit(cubictcp_unregister); + +MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("CUBIC TCP"); +MODULE_VERSION("2.0"); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf2e23086bce..0a461232329f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1; /* Adapt the MSS value used to make delayed ack decision to the * real world. */ -static inline void tcp_measure_rcv_mss(struct sock *sk, - const struct sk_buff *skb) +static void tcp_measure_rcv_mss(struct sock *sk, + const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; @@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, return 0; } -static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) { /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && @@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); } + +/* Initialize RCV_MSS value. + * RCV_MSS is an our guess about MSS used by the peer. + * We haven't any direct information about the MSS. + * It's better to underestimate the RCV_MSS rather than overestimate. + * Overestimations make us ACKing less frequently than needed. + * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). + */ +void tcp_initialize_rcv_mss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); + + hint = min(hint, tp->rcv_wnd/2); + hint = min(hint, TCP_MIN_RCVMSS); + hint = max(hint, TCP_MIN_MSS); + + inet_csk(sk)->icsk_ack.rcv_mss = hint; +} + /* Receiver "autotuning" code. * * The algorithm for RTT estimation w/o timestamps is based on @@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } +/* Set slow start threshold and cwnd not falling to slow start */ +void tcp_enter_cwr(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->prior_ssthresh = 0; + tp->bytes_acked = 0; + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + tp->undo_marker = 0; + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp) + 1U); + tp->snd_cwnd_cnt = 0; + tp->high_seq = tp->snd_nxt; + tp->snd_cwnd_stamp = tcp_time_stamp; + TCP_ECN_queue_cwr(tp); + + tcp_set_ca_state(sk, TCP_CA_CWR); + } +} + /* Initialize metrics on socket. */ static void tcp_init_metrics(struct sock *sk) @@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, flag); } -static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, - u32 in_flight, int good) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, + u32 in_flight, int good) { const struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); @@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, * RFC2988 recommends to restart timer to now+rto. */ -static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); @@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } -static inline u32 tcp_usrtt(const struct sk_buff *skb) +static u32 tcp_usrtt(const struct sk_buff *skb) { struct timeval tv, now; @@ -2342,7 +2383,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, if (nwin > tp->max_window) { tp->max_window = nwin; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); } } } @@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ -static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, - struct tcp_sock *tp) +static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, + struct tcp_sock *tp) { if (th->doff == sizeof(struct tcphdr)>>2) { tp->rx_opt.saw_tstamp = 0; @@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) } } -static __inline__ int -tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) +static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { if (before(seq, sp->start_seq)) @@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) return 0; } -static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { if (before(seq, tp->rcv_nxt)) @@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) } } -static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (!tp->rx_opt.dsack) tcp_dsack_set(tp, seq, end_seq); @@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } -static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) { __u32 tmp; @@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) { /* If the user specified a specific send buffer setting, do * not modify it. @@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk) sk->sk_write_space(sk); } -static inline void tcp_check_space(struct sock *sk) +static void tcp_check_space(struct sock *sk) { if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); @@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk) } } -static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) { tcp_push_pending_frames(sk, tp); tcp_check_space(sk); @@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) } } -static __inline__ void tcp_ack_snd_check(struct sock *sk) +static inline void tcp_ack_snd_check(struct sock *sk) { if (!inet_csk_ack_scheduled(sk)) { /* We sent a data segment already. */ @@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) return result; } -static __inline__ int -tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { return skb->ip_summed != CHECKSUM_UNNECESSARY && __tcp_checksum_complete_user(sk, skb); @@ -3967,12 +4006,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int saved_clamp = tp->rx_opt.mss_clamp; tcp_parse_options(skb, &tp->rx_opt, 0); if (th->ack) { - struct inet_connection_sock *icsk; /* rfc793: * "If the state is SYN-SENT then * first check the ACK bit @@ -4061,7 +4100,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.sack_ok && sysctl_tcp_fack) tp->rx_opt.sack_ok |= 2; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); /* Remember, tcp_poll() does not lock socket! @@ -4072,7 +4111,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_set_state(sk, TCP_ESTABLISHED); /* Make sure socket is routed, for correct metrics. */ - tp->af_specific->rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); @@ -4098,8 +4137,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, sk_wake_async(sk, 0, POLL_OUT); } - icsk = inet_csk(sk); - if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) { @@ -4173,7 +4210,7 @@ discard: if (tp->ecn_flags&TCP_ECN_OK) sock_set_flag(sk, SOCK_NO_LARGESEND); - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -4220,6 +4257,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; tp->rx_opt.saw_tstamp = 0; @@ -4236,7 +4274,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; if(th->syn) { - if(tp->af_specific->conn_request(sk, skb) < 0) + if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; /* Now we have several options: In theory there is @@ -4349,7 +4387,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Make sure socket is routed, for * correct metrics. */ - tp->af_specific->rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); @@ -4475,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); +EXPORT_SYMBOL(tcp_initialize_rcv_mss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4d5021e1929b..e9f83e5b28ce 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -69,6 +69,7 @@ #include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> +#include <net/timewait_sock.h> #include <net/xfrm.h> #include <linux/inet.h> @@ -86,8 +87,7 @@ int sysctl_tcp_low_latency; /* Socket used for sending RSTs */ static struct socket *tcp_socket; -void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb); +void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, @@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { static int tcp_v4_get_port(struct sock *sk, unsigned short snum) { - return inet_csk_get_port(&tcp_hashinfo, sk, snum); + return inet_csk_get_port(&tcp_hashinfo, sk, snum, + inet_csk_bind_conflict); } static void tcp_v4_hash(struct sock *sk) @@ -118,202 +119,38 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) skb->h.th->source); } -/* called with local bh disabled */ -static int __tcp_v4_check_established(struct sock *sk, __u16 lport, - struct inet_timewait_sock **twp) +int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) { - struct inet_sock *inet = inet_sk(sk); - u32 daddr = inet->rcv_saddr; - u32 saddr = inet->daddr; - int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - tw = inet_twsk(sk2); - - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); - struct tcp_sock *tp = tcp_sk(sk); - - /* With PAWS, it is safe from the viewpoint - of data integrity. Even without PAWS it - is safe provided sequence spaces do not - overlap i.e. at data rates <= 80Mbit/sec. - - Actually, the idea is close to VJ's one, - only timestamp cache is held not per host, - but per port pair and TW bucket is used - as state holder. + const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); + struct tcp_sock *tp = tcp_sk(sk); - If TW bucket has been already destroyed we - fall back to VJ's scheme and use initial - timestamp retrieved from peer table. - */ - if (tcptw->tw_ts_recent_stamp && - (!twp || (sysctl_tcp_tw_reuse && - xtime.tv_sec - - tcptw->tw_ts_recent_stamp > 1))) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - sock_hold(sk2); - goto unique; - } else - goto not_unique; - } - } - tw = NULL; + /* With PAWS, it is safe from the viewpoint + of data integrity. Even without PAWS it is safe provided sequence + spaces do not overlap i.e. at data rates <= 80Mbit/sec. - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } + Actually, the idea is close to VJ's one, only timestamp cache is + held not per host, but per port pair and TW bucket is used as state + holder. -unique: - /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); - sk->sk_hash = hash; - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &tcp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); + If TW bucket has been already destroyed we fall back to VJ's scheme + and use initial timestamp retrieved from peer table. + */ + if (tcptw->tw_ts_recent_stamp && + (twp == NULL || (sysctl_tcp_tw_reuse && + xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + sock_hold(sktw); + return 1; } return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; } -static inline u32 connect_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - - return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, - inet->dport); -} - -/* - * Bind a port for a connect operation and hash it. - */ -static inline int tcp_v4_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + connect_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__tcp_v4_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __inet_hash(&tcp_hashinfo, sk, 0); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &tcp_death_row);; - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet_hash(&tcp_hashinfo, sk, 0); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __tcp_v4_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} +EXPORT_SYMBOL_GPL(tcp_twsk_unique); /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -383,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->dport = usin->sin_port; inet->daddr = daddr; - tp->ext_header_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt) - tp->ext_header_len = inet->opt->optlen; + inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; tp->rx_opt.mss_clamp = 536; @@ -395,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); - err = tcp_v4_hash_connect(sk); + err = inet_hash_connect(&tcp_death_row, sk); if (err) goto failure; @@ -433,12 +270,10 @@ failure: /* * This routine does path mtu discovery as defined in RFC1191. */ -static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, - u32 mtu) +static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs * send out by Linux are always <576bytes so they should go through @@ -467,7 +302,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && - tp->pmtu_cookie > mtu) { + inet_csk(sk)->icsk_pmtu_cookie > mtu) { tcp_sync_mss(sk, mtu); /* Resend the TCP packet because it's @@ -644,10 +479,10 @@ out: } /* This routine computes an IPv4 TCP checksum. */ -void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb) +void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); + struct tcphdr *th = skb->h.th; if (skb->ip_summed == CHECKSUM_HW) { th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); @@ -826,7 +661,8 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -static inline void syn_flood_warning(struct sk_buff *skb) +#ifdef CONFIG_SYN_COOKIES +static void syn_flood_warning(struct sk_buff *skb) { static unsigned long warntime; @@ -837,12 +673,13 @@ static inline void syn_flood_warning(struct sk_buff *skb) ntohs(skb->h.th->dest)); } } +#endif /* * Save and compile IPv4 options into the request_sock if needed. */ -static inline struct ip_options *tcp_v4_save_options(struct sock *sk, - struct sk_buff *skb) +static struct ip_options *tcp_v4_save_options(struct sock *sk, + struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); struct ip_options *dopt = NULL; @@ -869,6 +706,11 @@ struct request_sock_ops tcp_request_sock_ops = { .send_reset = tcp_v4_send_reset, }; +static struct timewait_sock_ops tcp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_unique = tcp_twsk_unique, +}; + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct inet_request_sock *ireq; @@ -1053,9 +895,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ireq->opt = NULL; newinet->mc_index = inet_iif(skb); newinet->mc_ttl = skb->nh.iph->ttl; - newtp->ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newinet->opt) - newtp->ext_header_len = newinet->opt->optlen; + inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; newinet->id = newtp->write_seq ^ jiffies; tcp_sync_mss(newsk, dst_mtu(dst)); @@ -1314,16 +1156,6 @@ do_time_wait: goto discard_it; } -static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) -{ - struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; - struct inet_sock *inet = inet_sk(sk); - - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->daddr; - sin->sin_port = inet->dport; -} - /* VJ's idea. Save last timestamp seen from this destination * and hold it at least for normal timewait interval to use for duplicate * segment detection in subsequent connections, before they enter synchronized @@ -1382,7 +1214,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) return 0; } -struct tcp_func ipv4_specific = { +struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1392,7 +1224,7 @@ struct tcp_func ipv4_specific = { .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .addr2sockaddr = v4_addr2sockaddr, + .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), }; @@ -1433,7 +1265,8 @@ static int tcp_v4_init_sock(struct sock *sk) sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); - tp->af_specific = &ipv4_specific; + icsk->icsk_af_ops = &ipv4_specific; + icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; @@ -1989,7 +1822,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), - .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1b66a2ac4321..2b9b7f6c7f7c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -274,18 +274,18 @@ kill: void tcp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; + const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); int recycle_ok = 0; if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) - recycle_ok = tp->af_specific->remember_stamp(sk); + recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - const struct inet_connection_sock *icsk = inet_csk(sk); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; @@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); + struct inet6_timewait_sock *tw6; - ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); - ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); tw->tw_ipv6only = np->ipv6only; } #endif @@ -456,7 +458,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct request_sock **prev) { struct tcphdr *th = skb->h.th; - struct tcp_sock *tp = tcp_sk(sk); u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_options_received tmp_opt; @@ -613,7 +614,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ - child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, + req, NULL); if (child == NULL) goto listen_overflow; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b7325e0b406a..a7623ead39a8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1; */ int sysctl_tcp_tso_win_divisor = 3; -static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void update_send_head(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) { sk->sk_send_head = skb->next; if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) @@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) tp->snd_cwnd_used = 0; } -static inline void tcp_event_data_sent(struct tcp_sock *tp, - struct sk_buff *skb, struct sock *sk) +static void tcp_event_data_sent(struct tcp_sock *tp, + struct sk_buff *skb, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; @@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp, icsk->icsk_ack.pingpong = 1; } -static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) +static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); @@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss, * value can be stuffed directly into th->window for an outgoing * frame. */ -static __inline__ u16 tcp_select_window(struct sock *sk) +static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 cur_win = tcp_receive_window(tp); @@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk) return new_win; } +static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, + __u32 tstamp) +{ + if (tp->rx_opt.tstamp_ok) { + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); + *ptr++ = htonl(tp->rx_opt.ts_recent); + } + if (tp->rx_opt.eff_sacks) { + struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; + int this_sack; + + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK))); + for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { + *ptr++ = htonl(sp[this_sack].start_seq); + *ptr++ = htonl(sp[this_sack].end_seq); + } + if (tp->rx_opt.dsack) { + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks--; + } + } +} + +/* Construct a tcp options header for a SYN or SYN_ACK packet. + * If this is every changed make sure to change the definition of + * MAX_SYN_SIZE to match the new maximum number of options that you + * can generate. + */ +static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, + int offer_wscale, int wscale, __u32 tstamp, + __u32 ts_recent) +{ + /* We always get an MSS option. + * The option bytes which will be seen in normal data + * packets should timestamps be used, must be in the MSS + * advertised. But we subtract them from tp->mss_cache so + * that calculations in tcp_sendmsg are simpler etc. + * So account for this fact here if necessary. If we + * don't do this correctly, as a receiver we won't + * recognize data packets as being full sized when we + * should, and thus we won't abide by the delayed ACK + * rules correctly. + * SACKs don't matter, we never delay an ACK when we + * have any of those going out. + */ + *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); + if (ts) { + if(sack) + *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + else + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); /* TSVAL */ + *ptr++ = htonl(ts_recent); /* TSECR */ + } else if(sack) + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + if (offer_wscale) + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); +} /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial @@ -371,7 +440,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ECN_send(sk, tp, skb, tcp_header_size); } - tp->af_specific->send_check(sk, th, skb->len, skb); + icsk->icsk_af_ops->send_check(sk, skb->len, skb); if (likely(tcb->flags & TCPCB_FLAG_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); @@ -381,7 +450,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_INC_STATS(TCP_MIB_OUTSEGS); - err = tp->af_specific->queue_xmit(skb, 0); + err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (unlikely(err <= 0)) return err; @@ -621,7 +690,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) It is minimum of user_mss and mss received with SYN. It also does not include TCP options. - tp->pmtu_cookie is last pmtu, seen by this function. + inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. tp->mss_cache is current effective sending mss, including all tcp options except for SACKs. It is evaluated, @@ -631,26 +700,26 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) NOTE1. rfc1122 clearly states that advertised MSS DOES NOT include either tcp or ip options. - NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside - this function. --ANK (980731) + NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache + are READ ONLY outside this function. --ANK (980731) */ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); - int mss_now; - + struct inet_connection_sock *icsk = inet_csk(sk); /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ - mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); + int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - + sizeof(struct tcphdr)); /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ - mss_now -= tp->ext_header_len; + mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) @@ -664,7 +733,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); /* And store cached results */ - tp->pmtu_cookie = pmtu; + icsk->icsk_pmtu_cookie = pmtu; tp->mss_cache = mss_now; return mss_now; @@ -694,7 +763,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) if (dst) { u32 mtu = dst_mtu(dst); - if (mtu != tp->pmtu_cookie) + if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } @@ -705,9 +774,10 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) xmit_size_goal = mss_now; if (doing_tso) { - xmit_size_goal = 65535 - - tp->af_specific->net_header_len - - tp->ext_header_len - tp->tcp_header_len; + xmit_size_goal = (65535 - + inet_csk(sk)->icsk_af_ops->net_header_len - + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); if (tp->max_window && (xmit_size_goal > (tp->max_window >> 1))) @@ -723,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) /* Congestion window validation. (RFC2861) */ -static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) { __u32 packets_out = tp->packets_out; @@ -772,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk /* This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) +static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); @@ -1422,7 +1492,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) (sysctl_tcp_retrans_collapse != 0)) tcp_retrans_try_collapse(sk, skb, cur_mss); - if(tp->af_specific->rebuild_header(sk)) + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ /* Some Solaris stacks overoptimize and ignore the FIN on a @@ -1793,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* * Do all connect socket setups that can be done AF independent. */ -static inline void tcp_connect_init(struct sock *sk) +static void tcp_connect_init(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 13e7e6e8df16..3b7403495052 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -330,6 +330,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, vegas->cntRTT = 0; vegas->minRTT = 0x7fffffff; } + /* Use normal slow start */ + else if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + } /* Extract info for Tcp socket info provided via netlink. */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2422a5f7195d..223abaa72bc5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -86,6 +86,7 @@ #include <linux/module.h> #include <linux/socket.h> #include <linux/sockios.h> +#include <linux/igmp.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> @@ -846,20 +847,7 @@ out: csum_copy_err: UDP_INC_STATS_BH(UDP_MIB_INERRORS); - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } - - skb_free_datagram(sk, skb); + skb_kill_datagram(sk, skb, flags); if (noblock) return -EAGAIN; @@ -1094,7 +1082,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, * Otherwise, csum completion requires chacksumming packet body, * including udp header and folding it to skb->csum. */ -static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, +static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, unsigned short ulen, u32 saddr, u32 daddr) { if (uh->check == 0) { @@ -1108,7 +1096,6 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, /* Probably, we should checksum udp header (it should be in cache * in any case) and data in tiny packets (< rx copybreak). */ - return 0; } /* @@ -1141,8 +1128,7 @@ int udp_rcv(struct sk_buff *skb) if (pskb_trim_rcsum(skb, ulen)) goto short_packet; - if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) - goto csum_error; + udp_checksum_init(skb, uh, ulen, saddr, daddr); if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 6460eec834b7..9601fd7f9d66 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -8,7 +8,8 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ - ip6_flowlabel.o ipv6_syms.o netfilter.o + ip6_flowlabel.o ipv6_syms.o netfilter.o \ + inet6_connection_sock.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ xfrm6_output.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a60585fd85ad..704fb73e6c5f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1195,7 +1195,7 @@ struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device * int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) { const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; - const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); int sk_ipv6only = ipv6_only_sock(sk); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d9546380fa04..68afc53be662 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -167,6 +167,7 @@ lookup_protocol: sk->sk_reuse = 1; inet = inet_sk(sk); + inet->is_icsk = INET_PROTOSW_ICSK & answer_flags; if (SOCK_RAW == sock->type) { inet->num = protocol; @@ -389,6 +390,8 @@ int inet6_destroy_sock(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(inet6_destroy_sock); + /* * This does both peername and sockname. */ @@ -431,7 +434,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; - int err = -EINVAL; switch(cmd) { @@ -450,16 +452,15 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFDSTADDR: return addrconf_set_dstaddr((void __user *) arg); default: - if (!sk->sk_prot->ioctl || - (err = sk->sk_prot->ioctl(sk, cmd, arg)) == -ENOIOCTLCMD) - return(dev_ioctl(cmd,(void __user *) arg)); - return err; + if (!sk->sk_prot->ioctl) + return -ENOIOCTLCMD; + return sk->sk_prot->ioctl(sk, cmd, arg); } /*NOTREACHED*/ return(0); } -struct proto_ops inet6_stream_ops = { +const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -480,7 +481,7 @@ struct proto_ops inet6_stream_ops = { .sendpage = tcp_sendpage }; -struct proto_ops inet6_dgram_ops = { +const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -508,7 +509,7 @@ static struct net_proto_family inet6_family_ops = { }; /* Same as inet6_dgram_ops, sans udp_poll. */ -static struct proto_ops inet6_sockraw_ops = { +static const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -609,6 +610,79 @@ inet6_unregister_protosw(struct inet_protosw *p) } } +int inet6_sk_rebuild_header(struct sock *sk) +{ + int err; + struct dst_entry *dst; + struct ipv6_pinfo *np = inet6_sk(sk); + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) { + sk->sk_route_caps = 0; + return err; + } + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + return 0; +} + +EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); + +int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + + if (np->rxopt.all) { + if ((opt->hop && (np->rxopt.bits.hopopts || + np->rxopt.bits.ohopopts)) || + ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && + np->rxopt.bits.rxflow) || + (opt->srcrt && (np->rxopt.bits.srcrt || + np->rxopt.bits.osrcrt)) || + ((opt->dst1 || opt->dst0) && + (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) + return 1; + } + return 0; +} + +EXPORT_SYMBOL_GPL(ipv6_opt_accepted); + int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) { diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index f3629730eb15..13cc7f895583 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -33,6 +33,7 @@ #include <linux/string.h> #include <net/icmp.h> #include <net/ipv6.h> +#include <net/protocol.h> #include <net/xfrm.h> #include <asm/scatterlist.h> diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8bfbe9970793..6de8ee1a5ad9 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include <linux/random.h> #include <net/icmp.h> #include <net/ipv6.h> +#include <net/protocol.h> #include <linux/icmpv6.h> static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index be6faf311387..113374dc342c 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -413,6 +413,8 @@ ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr) return opt; } +EXPORT_SYMBOL_GPL(ipv6_invert_rthdr); + /********************************** Hop-by-hop options. **********************************/ @@ -579,6 +581,8 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) return opt2; } +EXPORT_SYMBOL_GPL(ipv6_dup_options); + static int ipv6_renew_option(void *ohdr, struct ipv6_opt_hdr __user *newopt, int newoptlen, int inherit, diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c new file mode 100644 index 000000000000..792f90f0f9ec --- /dev/null +++ b/net/ipv6/inet6_connection_sock.c @@ -0,0 +1,199 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET6 connection oriented protocols. + * + * Authors: See the TCPv6 sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/jhash.h> + +#include <net/addrconf.h> +#include <net/inet_connection_sock.h> +#include <net/inet_ecn.h> +#include <net/inet_hashtables.h> +#include <net/ip6_route.h> +#include <net/sock.h> + +int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) +{ + const struct sock *sk2; + const struct hlist_node *node; + + /* We must walk the whole port owner list in this case. -DaveM */ + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && + (!sk->sk_reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + + return node != NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); + +/* + * request_sock (formerly open request) hash tables. + */ +static u32 inet6_synq_hash(const struct in6_addr *raddr, const u16 rport, + const u32 rnd, const u16 synq_hsize) +{ + u32 a = raddr->s6_addr32[0]; + u32 b = raddr->s6_addr32[1]; + u32 c = raddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += rnd; + __jhash_mix(a, b, c); + + a += raddr->s6_addr32[3]; + b += (u32)rport; + __jhash_mix(a, b, c); + + return c & (synq_hsize - 1); +} + +struct request_sock *inet6_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, + const struct in6_addr *raddr, + const struct in6_addr *laddr, + const int iif) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport, + lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct inet6_request_sock *treq = inet6_rsk(req); + + if (inet_rsk(req)->rmt_port == rport && + req->rsk_ops->family == AF_INET6 && + ipv6_addr_equal(&treq->rmt_addr, raddr) && + ipv6_addr_equal(&treq->loc_addr, laddr) && + (!treq->iif || treq->iif == iif)) { + BUG_TRAP(req->sk == NULL); + *prevp = prev; + return req; + } + } + + return NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_search_req); + +void inet6_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned long timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr, + inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} + +EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); + +void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; + + sin6->sin6_family = AF_INET6; + ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); + sin6->sin6_port = inet_sk(sk)->dport; + /* We do not store received flowlabel for TCP */ + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (sk->sk_bound_dev_if && + ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = sk->sk_bound_dev_if; +} + +EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); + +int inet6_csk_xmit(struct sk_buff *skb, int ipfragok) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi fl; + struct dst_entry *dst; + struct in6_addr *final_p = NULL, final; + + memset(&fl, 0, sizeof(fl)); + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_sport = inet->sport; + fl.fl_ip_dport = inet->dport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + int err = ip6_dst_lookup(sk, &dst, &fl); + + if (err) { + sk->sk_err_soft = -err; + return err; + } + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_route_caps = 0; + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + skb->dst = dst_clone(dst); + + /* Restore final destination back after routing done */ + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + + return ip6_xmit(sk, skb, &fl, np->opt, 0); +} + +EXPORT_SYMBOL_GPL(inet6_csk_xmit); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 01d5f46d4e40..4154f3a8b6cf 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -5,7 +5,8 @@ * * Generic INET6 transport hashtables * - * Authors: Lotsa people, from code originally in tcp + * Authors: Lotsa people, from code originally in tcp, generalised here + * by Arnaldo Carvalho de Melo <acme@mandriva.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,12 +15,13 @@ */ #include <linux/config.h> - #include <linux/module.h> +#include <linux/random.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> +#include <net/ip.h> struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, @@ -79,3 +81,180 @@ struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, } EXPORT_SYMBOL_GPL(inet6_lookup); + +static int __inet6_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *daddr = &np->rcv_saddr; + const struct in6_addr *saddr = &np->daddr; + const int dif = sk->sk_bound_dev_if; + const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, + inet->dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + prefetch(head->chain.first); + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); + + tw = inet_twsk(sk2); + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk2->sk_family == PF_INET6 && + ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sk->sk_hash = hash; + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp != NULL) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw != NULL) { + /* Silly. Should hash-dance instead... */ + inet_twsk_deschedule(tw, death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet6_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32, + np->daddr.s6_addr32, + inet->dport); +} + +int inet6_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (snum == 0) { + const int low = sysctl_local_port_range[0]; + const int high = sysctl_local_port_range[1]; + const int range = high - low; + int i, port; + static u32 hint; + const u32 offset = hint + inet6_sk_port_offset(sk); + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__inet6_check_established(death_row, + sk, port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __inet6_hash(hinfo, sk); + } + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, death_row); + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + + if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { + __inet6_hash(hinfo, sk); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __inet6_check_established(death_row, sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 1cf02765fb5c..89d12b4817a9 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -200,6 +200,8 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, u32 label) return NULL; } +EXPORT_SYMBOL_GPL(fl6_sock_lookup); + void fl6_free_socklist(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8523c76ebf76..b4c4beba0ede 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -775,6 +775,8 @@ out_err_release: return err; } +EXPORT_SYMBOL_GPL(ip6_dst_lookup); + static inline int ip6_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 55917fb17094..626dd39685f2 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -47,6 +47,7 @@ #include <linux/rtnetlink.h> #include <net/icmp.h> #include <net/ipv6.h> +#include <net/protocol.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 3620718defe6..c63868dd2ca2 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -163,17 +163,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sk_refcnt_debug_dec(sk); if (sk->sk_protocol == IPPROTO_TCP) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); local_bh_disable(); sock_prot_dec_use(sk->sk_prot); sock_prot_inc_use(&tcp_prot); local_bh_enable(); sk->sk_prot = &tcp_prot; - tp->af_specific = &ipv4_specific; + icsk->icsk_af_ops = &ipv4_specific; sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { local_bh_disable(); sock_prot_dec_use(sk->sk_prot); @@ -317,14 +317,15 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, } retv = 0; - if (sk->sk_type == SOCK_STREAM) { + if (inet_sk(sk)->is_icsk) { if (opt) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { - tp->ext_header_len = opt->opt_flen + opt->opt_nflen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len = + opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg(&np->opt, opt); @@ -380,14 +381,15 @@ sticky_done: goto done; update: retv = 0; - if (sk->sk_type == SOCK_STREAM) { + if (inet_sk(sk)->is_icsk) { if (opt) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { - tp->ext_header_len = opt->opt_flen + opt->opt_nflen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len = + opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg(&np->opt, opt); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index f829a4ad3ccc..1cf305a9f8dd 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -224,7 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = MCAST_EXCLUDE; - mc_lst->sflock = RW_LOCK_UNLOCKED; + rwlock_init(&mc_lst->sflock); mc_lst->sflist = NULL; /* diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 95d469271c4d..ea43ef1d94a7 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -15,6 +15,7 @@ * - new extension header parser code */ #include <linux/config.h> +#include <linux/in.h> #include <linux/skbuff.h> #include <linux/kmod.h> #include <linux/vmalloc.h> @@ -86,11 +87,6 @@ static DECLARE_MUTEX(ip6t_mutex); context stops packets coming through and allows user context to read the counters or update the rules. - To be cache friendly on SMP, we arrange them like so: - [ n-entries ] - ... cache-align padding ... - [ n-entries ] - Hence the start of any table is given by get_table() below. */ /* The table itself */ @@ -108,20 +104,15 @@ struct ip6t_table_info unsigned int underflow[NF_IP6_NUMHOOKS]; /* ip6t_entry tables: one per CPU */ - char entries[0] ____cacheline_aligned; + void *entries[NR_CPUS]; }; static LIST_HEAD(ip6t_target); static LIST_HEAD(ip6t_match); static LIST_HEAD(ip6t_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - #if 0 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) @@ -376,8 +367,7 @@ ip6t_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -649,7 +639,8 @@ unconditional(const struct ip6t_ip6 *ipv6) /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) +mark_source_chains(struct ip6t_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -658,7 +649,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e - = (struct ip6t_entry *)(newinfo->entries + pos); + = (struct ip6t_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -708,13 +699,13 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) goto next; e = (struct ip6t_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct ip6t_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -731,7 +722,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) newpos = pos + e->next_offset; } e = (struct ip6t_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -941,6 +932,7 @@ static int translate_table(const char *name, unsigned int valid_hooks, struct ip6t_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -961,11 +953,11 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); if (ret != 0) return ret; @@ -993,27 +985,24 @@ translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) + if (!mark_source_chains(newinfo, valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + IP6T_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -1029,15 +1018,12 @@ replace_table(struct ip6t_table *table, #ifdef CONFIG_NETFILTER_DEBUG { - struct ip6t_entry *table_base; - unsigned int i; + int cpu; - for_each_cpu(i) { - table_base = - (void *)newinfo->entries - + TABLE_OFFSET(newinfo, i); - - table_base->comefrom = 0xdead57ac; + for_each_cpu(cpu) { + struct ip6t_entry *table_base = newinfo->entries[cpu]; + if (table_base) + table_base->comefrom = 0xdead57ac; } } #endif @@ -1072,16 +1058,44 @@ add_entry_to_counter(const struct ip6t_entry *e, return 0; } +static inline int +set_entry_to_counter(const struct ip6t_entry *e, + struct ip6t_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct ip6t_table_info *t, struct ip6t_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + IP6T_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -1098,6 +1112,7 @@ copy_entries_to_user(unsigned int total_size, struct ip6t_entry *e; struct ip6t_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -1109,13 +1124,13 @@ copy_entries_to_user(unsigned int total_size, return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + /* choose the copy that is on ourc node/cpu */ + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -1127,7 +1142,7 @@ copy_entries_to_user(unsigned int total_size, struct ip6t_entry_match *m; struct ip6t_entry_target *t; - e = (struct ip6t_entry *)(table->private->entries + off); + e = (struct ip6t_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], @@ -1196,6 +1211,46 @@ get_entries(const struct ip6t_get_entries *entries, return ret; } +static void free_table_info(struct ip6t_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct ip6t_table_info *alloc_table_info(unsigned int size) +{ + struct ip6t_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct ip6t_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + if (newinfo->entries[cpu] == NULL) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { @@ -1204,6 +1259,7 @@ do_replace(void __user *user, unsigned int len) struct ip6t_table *t; struct ip6t_table_info *newinfo, *oldinfo; struct ip6t_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1212,13 +1268,13 @@ do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1229,10 +1285,9 @@ do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -1271,8 +1326,9 @@ do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct ip6t_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1284,11 +1340,11 @@ do_replace(void __user *user, unsigned int len) module_put(t->me); up(&ip6t_mutex); free_newinfo_counters_untrans: - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1321,6 +1377,7 @@ do_add_counters(void __user *user, unsigned int len) struct ip6t_counters_info tmp, *paddc; struct ip6t_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1350,7 +1407,9 @@ do_add_counters(void __user *user, unsigned int len) } i = 0; - IP6T_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1543,28 +1602,29 @@ int ip6t_register_table(struct ip6t_table *table, struct ip6t_table_info *newinfo; static struct ip6t_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; - memcpy(newinfo->entries, repl->entries, repl->size); + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&ip6t_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1593,20 +1653,23 @@ int ip6t_register_table(struct ip6t_table *table, return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void ip6t_unregister_table(struct ip6t_table *table) { + void *loc_cpu_entry; + down(&ip6t_mutex); LIST_DELETE(&ip6t_tables, table); up(&ip6t_mutex); /* Decrease module usage counts and free resources */ - IP6T_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* Returns 1 if the port is matched by the range, 0 otherwise */ diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 0cd1d1bd9033..ae4653bfd654 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/skbuff.h> +#include <linux/if_arp.h> #include <linux/ip.h> #include <linux/spinlock.h> #include <linux/icmpv6.h> diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index dde37793d20b..268918d5deea 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/ip.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c index 24bc0cde43a1..65937de1b58c 100644 --- a/net/ipv6/netfilter/ip6t_esp.c +++ b/net/ipv6/netfilter/ip6t_esp.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/ip.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c2c52af9e560..f3e5ffbd592f 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -98,7 +98,7 @@ struct nf_ct_frag6_queue #define FRAG6Q_HASHSZ 64 static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ]; -static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(nf_ct_frag6_lock); static u32 nf_ct_frag6_hash_rnd; static LIST_HEAD(nf_ct_frag6_lru_list); int nf_ct_frag6_nqueues = 0; @@ -371,7 +371,7 @@ nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct init_timer(&fq->timer); fq->timer.function = nf_ct_frag6_expire; fq->timer.data = (long) fq; - fq->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&fq->lock); atomic_set(&fq->refcnt, 1); return nf_ct_frag6_intern(hash, fq); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a66900cda2af..66f1d12ea578 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -32,6 +32,7 @@ #include <linux/icmpv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> +#include <linux/skbuff.h> #include <asm/uaccess.h> #include <asm/ioctls.h> #include <asm/bug.h> @@ -433,25 +434,14 @@ out: return err; csum_copy_err: - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } + skb_kill_datagram(sk, skb, flags); /* Error for blocking case is chosen to masquerade as some normal condition. */ err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; /* FIXME: increment a raw6 drops counter here */ - goto out_free; + goto out; } static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8827389abaf7..2947bc56d8a0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -48,6 +48,7 @@ #include <net/tcp.h> #include <net/ndisc.h> #include <net/inet6_hashtables.h> +#include <net/inet6_connection_sock.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> @@ -59,6 +60,7 @@ #include <net/addrconf.h> #include <net/snmp.h> #include <net/dsfield.h> +#include <net/timewait_sock.h> #include <asm/uaccess.h> @@ -67,224 +69,33 @@ static void tcp_v6_send_reset(struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req); -static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, +static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb); static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); -static struct tcp_func ipv6_mapped; -static struct tcp_func ipv6_specific; +static struct inet_connection_sock_af_ops ipv6_mapped; +static struct inet_connection_sock_af_ops ipv6_specific; -static inline int tcp_v6_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb) -{ - const struct sock *sk2; - const struct hlist_node *node; - - /* We must walk the whole port owner list in this case. -DaveM */ - sk_for_each_bound(sk2, node, &tb->owners) { - if (sk != sk2 && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && - (!sk->sk_reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) && - ipv6_rcv_saddr_equal(sk, sk2)) - break; - } - - return node != NULL; -} - -/* Grrr, addr_type already calculated by caller, but I don't want - * to add some silly "cookie" argument to this method just for that. - * But it doesn't matter, the recalculation is in the rarest path - * this function ever takes. - */ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) { - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - struct hlist_node *node; - int ret; - - local_bh_disable(); - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; - - do { - head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == rover) - goto next; - break; - next: - spin_unlock(&head->lock); - if (++rover > high) - rover = low; - } while (--remaining > 0); - - /* Exhausted local port range during search? It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. - */ - ret = 1; - if (unlikely(remaining <= 0)) - goto fail; - - /* OK, here is the one we will use. */ - snum = rover; - } else { - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == snum) - goto tb_found; - } - tb = NULL; - goto tb_not_found; -tb_found: - if (tb && !hlist_empty(&tb->owners)) { - if (tb->fastreuse > 0 && sk->sk_reuse && - sk->sk_state != TCP_LISTEN) { - goto success; - } else { - ret = 1; - if (tcp_v6_bind_conflict(sk, tb)) - goto fail_unlock; - } - } -tb_not_found: - ret = 1; - if (tb == NULL) { - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum); - if (tb == NULL) - goto fail_unlock; - } - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else - tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; - -success: - if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); - ret = 0; - -fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); - return ret; -} - -static __inline__ void __tcp_v6_hash(struct sock *sk) -{ - struct hlist_head *list; - rwlock_t *lock; - - BUG_TRAP(sk_unhashed(sk)); - - if (sk->sk_state == TCP_LISTEN) { - list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &tcp_hashinfo.lhash_lock; - inet_listen_wlock(&tcp_hashinfo); - } else { - unsigned int hash; - sk->sk_hash = hash = inet6_sk_ehashfn(sk); - hash &= (tcp_hashinfo.ehash_size - 1); - list = &tcp_hashinfo.ehash[hash].chain; - lock = &tcp_hashinfo.ehash[hash].lock; - write_lock(lock); - } - - __sk_add_node(sk, list); - sock_prot_inc_use(sk->sk_prot); - write_unlock(lock); + return inet_csk_get_port(&tcp_hashinfo, sk, snum, + inet6_csk_bind_conflict); } - static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { - struct tcp_sock *tp = tcp_sk(sk); - - if (tp->af_specific == &ipv6_mapped) { + if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { tcp_prot.hash(sk); return; } local_bh_disable(); - __tcp_v6_hash(sk); + __inet6_hash(&tcp_hashinfo, sk); local_bh_enable(); } } -/* - * Open request hash tables. - */ - -static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd) -{ - u32 a, b, c; - - a = raddr->s6_addr32[0]; - b = raddr->s6_addr32[1]; - c = raddr->s6_addr32[2]; - - a += JHASH_GOLDEN_RATIO; - b += JHASH_GOLDEN_RATIO; - c += rnd; - __jhash_mix(a, b, c); - - a += raddr->s6_addr32[3]; - b += (u32) rport; - __jhash_mix(a, b, c); - - return c & (TCP_SYNQ_HSIZE - 1); -} - -static struct request_sock *tcp_v6_search_req(const struct sock *sk, - struct request_sock ***prevp, - __u16 rport, - struct in6_addr *raddr, - struct in6_addr *laddr, - int iif) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req, **prev; - - for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; - (req = *prev) != NULL; - prev = &req->dl_next) { - const struct tcp6_request_sock *treq = tcp6_rsk(req); - - if (inet_rsk(req)->rmt_port == rport && - req->rsk_ops->family == AF_INET6 && - ipv6_addr_equal(&treq->rmt_addr, raddr) && - ipv6_addr_equal(&treq->loc_addr, laddr) && - (!treq->iif || treq->iif == iif)) { - BUG_TRAP(req->sk == NULL); - *prevp = prev; - return req; - } - } - - return NULL; -} - static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, struct in6_addr *saddr, struct in6_addr *daddr, @@ -308,195 +119,12 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) } } -static int __tcp_v6_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - const struct in6_addr *daddr = &np->rcv_saddr; - const struct in6_addr *saddr = &np->daddr; - const int dif = sk->sk_bound_dev_if; - const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2); - - tw = inet_twsk(sk2); - - if(*((__u32 *)&(tw->tw_dport)) == ports && - sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && - sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); - struct tcp_sock *tp = tcp_sk(sk); - - if (tcptw->tw_ts_recent_stamp && - (!twp || - (sysctl_tcp_tw_reuse && - xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { - /* See comment in tcp_ipv4.c */ - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (!tp->write_seq) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - sock_hold(sk2); - goto unique; - } else - goto not_unique; - } - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) - goto not_unique; - } - -unique: - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sk->sk_hash = hash; - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &tcp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -static inline u32 tcpv6_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - - return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32, - np->daddr.s6_addr32, - inet->dport); -} - -static int tcp_v6_hash_connect(struct sock *sk) -{ - unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + tcpv6_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__tcp_v6_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __tcp_v6_hash(sk); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &tcp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __tcp_v6_hash(sk); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __tcp_v6_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; - struct inet_sock *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p = NULL, final; @@ -571,7 +199,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, */ if (addr_type == IPV6_ADDR_MAPPED) { - u32 exthdrlen = tp->ext_header_len; + u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); @@ -583,14 +211,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - tp->af_specific = &ipv6_mapped; + icsk->icsk_af_ops = &ipv6_mapped; sk->sk_backlog_rcv = tcp_v4_do_rcv; err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { - tp->ext_header_len = exthdrlen; - tp->af_specific = &ipv6_specific; + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &ipv6_specific; sk->sk_backlog_rcv = tcp_v6_do_rcv; goto failure; } else { @@ -643,16 +271,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - tp->ext_header_len = 0; + icsk->icsk_ext_hdr_len = 0; if (np->opt) - tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); inet->dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); - err = tcp_v6_hash_connect(sk); + err = inet6_hash_connect(&tcp_death_row, sk); if (err) goto late_failure; @@ -758,7 +387,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else dst_hold(dst); - if (tp->pmtu_cookie > dst_mtu(dst)) { + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ @@ -775,8 +404,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sock_owned_by_user(sk)) goto out; - req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, - &hdr->saddr, inet6_iif(skb)); + req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, + &hdr->saddr, inet6_iif(skb)); if (!req) goto out; @@ -822,7 +451,7 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, struct dst_entry *dst) { - struct tcp6_request_sock *treq = tcp6_rsk(req); + struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff * skb; struct ipv6_txoptions *opt = NULL; @@ -888,8 +517,8 @@ done: static void tcp_v6_reqsk_destructor(struct request_sock *req) { - if (tcp6_rsk(req)->pktopts) - kfree_skb(tcp6_rsk(req)->pktopts); + if (inet6_rsk(req)->pktopts) + kfree_skb(inet6_rsk(req)->pktopts); } static struct request_sock_ops tcp6_request_sock_ops = { @@ -901,26 +530,15 @@ static struct request_sock_ops tcp6_request_sock_ops = { .send_reset = tcp_v6_send_reset }; -static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_skb_parm *opt = IP6CB(skb); - - if (np->rxopt.all) { - if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || - ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && np->rxopt.bits.rxflow) || - (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) || - ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) - return 1; - } - return 0; -} - +static struct timewait_sock_ops tcp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, +}; -static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb) +static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); + struct tcphdr *th = skb->h.th; if (skb->ip_summed == CHECKSUM_HW) { th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); @@ -1091,8 +709,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) struct sock *nsk; /* Find possible connection requests. */ - req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, inet6_iif(skb)); + req = inet6_csk_search_req(sk, &prev, th->source, + &skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, inet6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev); @@ -1116,23 +735,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) return sk; } -static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); - inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); -} - - /* FIXME: this is substantially similar to the ipv4 code. * Can some kind of merge be done? -- erics */ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp6_request_sock *treq; + struct inet6_request_sock *treq; struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); @@ -1157,7 +765,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = reqsk_alloc(&tcp6_request_sock_ops); + req = inet6_reqsk_alloc(&tcp6_request_sock_ops); if (req == NULL) goto drop; @@ -1170,7 +778,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); - treq = tcp6_rsk(req); + treq = inet6_rsk(req); ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr); ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr); TCP_ECN_create_request(req, skb->h.th); @@ -1196,8 +804,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (tcp_v6_send_synack(sk, req, NULL)) goto drop; - tcp_v6_synq_add(sk, req); - + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; drop: @@ -1212,7 +819,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { - struct tcp6_request_sock *treq = tcp6_rsk(req); + struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; @@ -1247,7 +854,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); - newtp->af_specific = &ipv6_mapped; + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; newsk->sk_backlog_rcv = tcp_v4_do_rcv; newnp->pktoptions = NULL; newnp->opt = NULL; @@ -1261,10 +868,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, */ /* It is tricky place. Until this moment IPv4 tcp - worked with IPv6 af_tcp.af_specific. + worked with IPv6 icsk.icsk_af_ops. Sync it now. */ - tcp_sync_mss(newsk, newtp->pmtu_cookie); + tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } @@ -1371,10 +978,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, sock_kfree_s(sk, opt, opt->tot_len); } - newtp->ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newnp->opt) - newtp->ext_header_len = newnp->opt->opt_nflen + - newnp->opt->opt_flen; + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); @@ -1382,7 +989,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; - __tcp_v6_hash(newsk); + __inet6_hash(&tcp_hashinfo, newsk); inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; @@ -1679,139 +1286,16 @@ do_time_wait: goto discard_it; } -static int tcp_v6_rebuild_header(struct sock *sk) -{ - int err; - struct dst_entry *dst; - struct ipv6_pinfo *np = inet6_sk(sk); - - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - struct inet_sock *inet = inet_sk(sk); - struct in6_addr *final_p = NULL, final; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_dport = inet->dport; - fl.fl_ip_sport = inet->sport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) { - sk->sk_route_caps = 0; - return err; - } - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_err_soft = -err; - return err; - } - - ip6_dst_store(sk, dst, NULL); - sk->sk_route_caps = dst->dev->features & - ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - } - - return 0; -} - -static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok) -{ - struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi fl; - struct dst_entry *dst; - struct in6_addr *final_p = NULL, final; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_sport = inet->sport; - fl.fl_ip_dport = inet->dport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - int err = ip6_dst_lookup(sk, &dst, &fl); - - if (err) { - sk->sk_err_soft = -err; - return err; - } - - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_route_caps = 0; - return err; - } - - ip6_dst_store(sk, dst, NULL); - sk->sk_route_caps = dst->dev->features & - ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - } - - skb->dst = dst_clone(dst); - - /* Restore final destination back after routing done */ - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - - return ip6_xmit(sk, skb, &fl, np->opt, 0); -} - -static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; - - sin6->sin6_family = AF_INET6; - ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); - sin6->sin6_port = inet_sk(sk)->dport; - /* We do not store received flowlabel for TCP */ - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (sk->sk_bound_dev_if && - ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = sk->sk_bound_dev_if; -} - static int tcp_v6_remember_stamp(struct sock *sk) { /* Alas, not yet... */ return 0; } -static struct tcp_func ipv6_specific = { - .queue_xmit = tcp_v6_xmit, +static struct inet_connection_sock_af_ops ipv6_specific = { + .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, - .rebuild_header = tcp_v6_rebuild_header, + .rebuild_header = inet6_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .remember_stamp = tcp_v6_remember_stamp, @@ -1819,7 +1303,7 @@ static struct tcp_func ipv6_specific = { .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = v6_addr2sockaddr, + .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6) }; @@ -1827,7 +1311,7 @@ static struct tcp_func ipv6_specific = { * TCP over IPv4 via INET6 API */ -static struct tcp_func ipv6_mapped = { +static struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1838,7 +1322,7 @@ static struct tcp_func ipv6_mapped = { .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = v6_addr2sockaddr, + .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6) }; @@ -1877,8 +1361,9 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; - tp->af_specific = &ipv6_specific; + icsk->icsk_af_ops = &ipv6_specific; icsk->icsk_ca_ops = &tcp_init_congestion_ops; + icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); @@ -1900,14 +1385,13 @@ static int tcp_v6_destroy_sock(struct sock *sk) static void get_openreq6(struct seq_file *seq, struct sock *sk, struct request_sock *req, int i, int uid) { - struct in6_addr *dest, *src; int ttd = req->expires - jiffies; + struct in6_addr *src = &inet6_rsk(req)->loc_addr; + struct in6_addr *dest = &inet6_rsk(req)->rmt_addr; if (ttd < 0) ttd = 0; - src = &tcp6_rsk(req)->loc_addr; - dest = &tcp6_rsk(req)->rmt_addr; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", @@ -1988,14 +1472,14 @@ static void get_timewait6_sock(struct seq_file *seq, { struct in6_addr *dest, *src; __u16 destp, srcp; - struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); + struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw); int ttd = tw->tw_ttd - jiffies; if (ttd < 0) ttd = 0; - dest = &tcp6tw->tw_v6_daddr; - src = &tcp6tw->tw_v6_rcv_saddr; + dest = &tw6->tw_v6_daddr; + src = &tw6->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); @@ -2093,7 +1577,7 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), - .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, }; @@ -2110,7 +1594,8 @@ static struct inet_protosw tcpv6_protosw = { .ops = &inet6_stream_ops, .capability = -1, .no_check = 0, - .flags = INET_PROTOSW_PERMANENT, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, }; void __init tcpv6_init(void) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5cc8731eb55b..d8538dcea813 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -36,6 +36,7 @@ #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/init.h> +#include <linux/skbuff.h> #include <asm/uaccess.h> #include <net/sock.h> @@ -300,20 +301,7 @@ out: return err; csum_copy_err: - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } - - skb_free_datagram(sk, skb); + skb_kill_datagram(sk, skb, flags); if (flags & MSG_DONTWAIT) { UDP6_INC_STATS_USER(UDP_MIB_INERRORS); diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 34b3bb868409..0dc519b40404 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -75,7 +75,7 @@ static struct datalink_proto *pEII_datalink; static struct datalink_proto *p8023_datalink; static struct datalink_proto *pSNAP_datalink; -static struct proto_ops ipx_dgram_ops; +static const struct proto_ops ipx_dgram_ops; LIST_HEAD(ipx_interfaces); DEFINE_SPINLOCK(ipx_interfaces_lock); @@ -1884,7 +1884,7 @@ static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = -EINVAL; break; default: - rc = dev_ioctl(cmd, argp); + rc = -ENOIOCTLCMD; break; } @@ -1901,7 +1901,7 @@ static struct net_proto_family ipx_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { .family = PF_IPX, .owner = THIS_MODULE, .release = ipx_release, diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 6f92f9c62990..fbfa96754417 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -62,12 +62,12 @@ static int irda_create(struct socket *sock, int protocol); -static struct proto_ops irda_stream_ops; -static struct proto_ops irda_seqpacket_ops; -static struct proto_ops irda_dgram_ops; +static const struct proto_ops irda_stream_ops; +static const struct proto_ops irda_seqpacket_ops; +static const struct proto_ops irda_dgram_ops; #ifdef CONFIG_IRDA_ULTRA -static struct proto_ops irda_ultra_ops; +static const struct proto_ops irda_ultra_ops; #define ULTRA_MAX_DATA 382 #endif /* CONFIG_IRDA_ULTRA */ @@ -1438,8 +1438,9 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, /* * POSIX 1003.1g mandates this order. */ - if (sk->sk_err) - ret = sock_error(sk); + ret = sock_error(sk); + if (ret) + break; else if (sk->sk_shutdown & RCV_SHUTDOWN) ; else if (noblock) @@ -1821,7 +1822,7 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return -EINVAL; default: IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__); - return dev_ioctl(cmd, (void __user *) arg); + return -ENOIOCTLCMD; } /*NOTREACHED*/ @@ -2463,7 +2464,7 @@ static struct net_proto_family irda_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2484,7 +2485,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { .sendpage = sock_no_sendpage, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2505,7 +2506,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { .sendpage = sock_no_sendpage, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2527,7 +2528,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { }; #ifdef CONFIG_IRDA_ULTRA -static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, diff --git a/net/key/af_key.c b/net/key/af_key.c index 39031684b65c..52efd04cbedb 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -113,7 +113,7 @@ static __inline__ void pfkey_unlock_table(void) } -static struct proto_ops pfkey_ops; +static const struct proto_ops pfkey_ops; static void pfkey_insert(struct sock *sk) { @@ -336,6 +336,7 @@ static u8 sadb_ext_min_len[] = { [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), + [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx), }; /* Verify sadb_address_{len,prefixlen} against sa_family. */ @@ -383,6 +384,55 @@ static int verify_address_len(void *p) return 0; } +static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx) +{ + int len = 0; + + len += sizeof(struct sadb_x_sec_ctx); + len += sec_ctx->sadb_x_ctx_len; + len += sizeof(uint64_t) - 1; + len /= sizeof(uint64_t); + + return len; +} + +static inline int verify_sec_ctx_len(void *p) +{ + struct sadb_x_sec_ctx *sec_ctx = (struct sadb_x_sec_ctx *)p; + int len; + + if (sec_ctx->sadb_x_ctx_len > PAGE_SIZE) + return -EINVAL; + + len = pfkey_sec_ctx_len(sec_ctx); + + if (sec_ctx->sadb_x_sec_len != len) + return -EINVAL; + + return 0; +} + +static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb_x_sec_ctx *sec_ctx) +{ + struct xfrm_user_sec_ctx *uctx = NULL; + int ctx_size = sec_ctx->sadb_x_ctx_len; + + uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL); + + if (!uctx) + return NULL; + + uctx->len = pfkey_sec_ctx_len(sec_ctx); + uctx->exttype = sec_ctx->sadb_x_sec_exttype; + uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi; + uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg; + uctx->ctx_len = sec_ctx->sadb_x_ctx_len; + memcpy(uctx + 1, sec_ctx + 1, + uctx->ctx_len); + + return uctx; +} + static int present_and_same_family(struct sadb_address *src, struct sadb_address *dst) { @@ -438,6 +488,10 @@ static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_h if (verify_address_len(p)) return -EINVAL; } + if (ext_type == SADB_X_EXT_SEC_CTX) { + if (verify_sec_ctx_len(p)) + return -EINVAL; + } ext_hdrs[ext_type-1] = p; } p += ext_len; @@ -586,6 +640,9 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, struct sadb_key *key; struct sadb_x_sa2 *sa2; struct sockaddr_in *sin; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *xfrm_ctx; + int ctx_size = 0; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct sockaddr_in6 *sin6; #endif @@ -609,6 +666,12 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, sizeof(struct sadb_address)*2 + sockaddr_size*2 + sizeof(struct sadb_x_sa2); + + if ((xfrm_ctx = x->security)) { + ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); + size += sizeof(struct sadb_x_sec_ctx) + ctx_size; + } + /* identity & sensitivity */ if ((x->props.family == AF_INET && @@ -899,6 +962,20 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, n_port->sadb_x_nat_t_port_reserved = 0; } + /* security context */ + if (xfrm_ctx) { + sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, + sizeof(struct sadb_x_sec_ctx) + ctx_size); + sec_ctx->sadb_x_sec_len = + (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); + sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; + sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; + sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; + sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; + memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, + xfrm_ctx->ctx_len); + } + return skb; } @@ -909,6 +986,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr, struct sadb_lifetime *lifetime; struct sadb_sa *sa; struct sadb_key *key; + struct sadb_x_sec_ctx *sec_ctx; uint16_t proto; int err; @@ -993,6 +1071,21 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr, x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } + + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) + goto out; + + err = security_xfrm_state_alloc(x, uctx); + kfree(uctx); + + if (err) + goto out; + } + key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; if (sa->sadb_sa_auth) { int keysize = 0; @@ -1720,6 +1813,18 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) return 0; } +static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp) +{ + struct xfrm_sec_ctx *xfrm_ctx = xp->security; + + if (xfrm_ctx) { + int len = sizeof(struct sadb_x_sec_ctx); + len += xfrm_ctx->ctx_len; + return PFKEY_ALIGN8(len); + } + return 0; +} + static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp) { int sockaddr_size = pfkey_sockaddr_size(xp->family); @@ -1733,7 +1838,8 @@ static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp) (sockaddr_size * 2) + sizeof(struct sadb_x_policy) + (xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) + - (socklen * 2))); + (socklen * 2))) + + pfkey_xfrm_policy2sec_ctx_size(xp); } static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp) @@ -1757,6 +1863,8 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i struct sadb_lifetime *lifetime; struct sadb_x_policy *pol; struct sockaddr_in *sin; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *xfrm_ctx; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct sockaddr_in6 *sin6; #endif @@ -1941,6 +2049,21 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i } } } + + /* security context */ + if ((xfrm_ctx = xp->security)) { + int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp); + + sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size); + sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t); + sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; + sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; + sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; + sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; + memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, + xfrm_ctx->ctx_len); + } + hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_reserved = atomic_read(&xp->refcnt); } @@ -1976,12 +2099,13 @@ out: static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) { - int err; + int err = 0; struct sadb_lifetime *lifetime; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; + struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || @@ -2028,6 +2152,22 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h if (xp->selector.dport) xp->selector.dport_mask = ~0; + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) { + err = -ENOBUFS; + goto out; + } + + err = security_xfrm_policy_alloc(xp, uctx); + kfree(uctx); + + if (err) + goto out; + } + xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; @@ -2051,10 +2191,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, hdr->sadb_msg_type != SADB_X_SPDUPDATE); - if (err) { - kfree(xp); - return err; - } + + if (err) + goto out; if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) c.event = XFRM_MSG_UPDPOLICY; @@ -2069,6 +2208,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h return 0; out: + security_xfrm_policy_free(xp); kfree(xp); return err; } @@ -2078,9 +2218,10 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg int err; struct sadb_address *sa; struct sadb_x_policy *pol; - struct xfrm_policy *xp; + struct xfrm_policy *xp, tmp; struct xfrm_selector sel; struct km_event c; + struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || @@ -2109,7 +2250,24 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg if (sel.dport) sel.dport_mask = ~0; - xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1); + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + memset(&tmp, 0, sizeof(struct xfrm_policy)); + + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) + return -ENOMEM; + + err = security_xfrm_policy_alloc(&tmp, uctx); + kfree(uctx); + + if (err) + return err; + } + + xp = xfrm_policy_bysel_ctx(pol->sadb_x_policy_dir-1, &sel, tmp.security, 1); + security_xfrm_policy_free(&tmp); if (xp == NULL) return -ENOENT; @@ -2660,6 +2818,7 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt, { struct xfrm_policy *xp; struct sadb_x_policy *pol = (struct sadb_x_policy*)data; + struct sadb_x_sec_ctx *sec_ctx; switch (family) { case AF_INET: @@ -2709,10 +2868,32 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt, (*dir = parse_ipsecrequests(xp, pol)) < 0) goto out; + /* security context too */ + if (len >= (pol->sadb_x_policy_len*8 + + sizeof(struct sadb_x_sec_ctx))) { + char *p = (char *)pol; + struct xfrm_user_sec_ctx *uctx; + + p += pol->sadb_x_policy_len*8; + sec_ctx = (struct sadb_x_sec_ctx *)p; + if (len < pol->sadb_x_policy_len*8 + + sec_ctx->sadb_x_sec_len) + goto out; + if ((*dir = verify_sec_ctx_len(p))) + goto out; + uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + *dir = security_xfrm_policy_alloc(xp, uctx); + kfree(uctx); + + if (*dir) + goto out; + } + *dir = pol->sadb_x_policy_dir-1; return xp; out: + security_xfrm_policy_free(xp); kfree(xp); return NULL; } @@ -2946,7 +3127,7 @@ out: return err; } -static struct proto_ops pfkey_ops = { +static const struct proto_ops pfkey_ops = { .family = PF_KEY, .owner = THIS_MODULE, /* Operations that make no sense on pfkey sockets. */ diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index c3f0b0783453..8171c53bc0ed 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -36,7 +36,7 @@ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; -static struct proto_ops llc_ui_ops; +static const struct proto_ops llc_ui_ops; static int llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); @@ -566,10 +566,9 @@ static int llc_wait_data(struct sock *sk, long timeo) /* * POSIX 1003.1g mandates this order. */ - if (sk->sk_err) { - rc = sock_error(sk); + rc = sock_error(sk); + if (rc) break; - } rc = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) break; @@ -960,7 +959,7 @@ out: static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - return dev_ioctl(cmd, (void __user *)arg); + return -ENOIOCTLCMD; } /** @@ -1099,7 +1098,7 @@ static struct net_proto_family llc_ui_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops llc_ui_ops = { +static const struct proto_ops llc_ui_ops = { .family = PF_LLC, .owner = THIS_MODULE, .release = llc_ui_release, diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index cba63729313d..e10512e229b6 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -151,7 +151,7 @@ instance_create(u_int16_t group_num, int pid) goto out_unlock; INIT_HLIST_NODE(&inst->hlist); - inst->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&inst->lock); /* needs to be two, since we _put() after creation */ atomic_set(&inst->use, 2); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f28460b61e47..55afdda3d940 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -148,7 +148,7 @@ instance_create(u_int16_t queue_num, int pid) atomic_set(&inst->id_sequence, 0); /* needs to be two, since we _put() after creation */ atomic_set(&inst->use, 2); - inst->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); if (!try_module_get(THIS_MODULE)) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 96020d7087e8..7849cac14d3a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -293,7 +293,7 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len) return 0; } -static struct proto_ops netlink_ops; +static const struct proto_ops netlink_ops; static int netlink_insert(struct sock *sk, u32 pid) { @@ -1656,7 +1656,7 @@ int netlink_unregister_notifier(struct notifier_block *nb) return notifier_chain_unregister(&netlink_chain, nb); } -static struct proto_ops netlink_ops = { +static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 287cfcc56951..3b1378498d50 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -441,7 +441,7 @@ errout: } static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid, - int seq, int cmd) + int seq, u8 cmd) { struct sk_buff *skb; int err; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index e5d82d711cae..63b0e4afeb33 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -63,7 +63,7 @@ static unsigned short circuit = 0x101; static HLIST_HEAD(nr_list); static DEFINE_SPINLOCK(nr_list_lock); -static struct proto_ops nr_proto_ops; +static const struct proto_ops nr_proto_ops; /* * Socket removal during an interrupt is now safe. @@ -1166,10 +1166,11 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) void __user *argp = (void __user *)arg; int ret; - lock_sock(sk); switch (cmd) { case TIOCOUTQ: { long amount; + + lock_sock(sk); amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); if (amount < 0) amount = 0; @@ -1180,6 +1181,8 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case TIOCINQ: { struct sk_buff *skb; long amount = 0L; + + lock_sock(sk); /* These two are safe on a single CPU system as only user tasks fiddle here */ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; @@ -1188,6 +1191,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } case SIOCGSTAMP: + lock_sock(sk); ret = sock_get_timestamp(sk, argp); release_sock(sk); return ret; @@ -1202,21 +1206,17 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: - release_sock(sk); return -EINVAL; case SIOCADDRT: case SIOCDELRT: case SIOCNRDECOBS: - release_sock(sk); if (!capable(CAP_NET_ADMIN)) return -EPERM; return nr_rt_ioctl(cmd, argp); default: - release_sock(sk); - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } - release_sock(sk); return 0; } @@ -1337,7 +1337,7 @@ static struct net_proto_family nr_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops nr_proto_ops = { +static const struct proto_ops nr_proto_ops = { .family = PF_NETROM, .owner = THIS_MODULE, .release = nr_release, diff --git a/net/nonet.c b/net/nonet.c index e5241dceaa57..1230f0ae832e 100644 --- a/net/nonet.c +++ b/net/nonet.c @@ -14,11 +14,6 @@ #include <linux/init.h> #include <linux/kernel.h> -void __init sock_init(void) -{ - printk(KERN_INFO "Linux NoNET1.0 for Linux 2.6\n"); -} - static int sock_no_open(struct inode *irrelevant, struct file *dontcare) { return -ENXIO; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3e2462760413..f69e5ed9bd06 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -251,10 +251,10 @@ static void packet_sock_destruct(struct sock *sk) } -static struct proto_ops packet_ops; +static const struct proto_ops packet_ops; #ifdef CONFIG_SOCK_PACKET -static struct proto_ops packet_ops_spkt; +static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { @@ -1521,7 +1521,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, #endif default: - return dev_ioctl(cmd, (void __user *)arg); + return -ENOIOCTLCMD; } return 0; } @@ -1784,7 +1784,7 @@ out: #ifdef CONFIG_SOCK_PACKET -static struct proto_ops packet_ops_spkt = { +static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, @@ -1806,7 +1806,7 @@ static struct proto_ops packet_ops_spkt = { }; #endif -static struct proto_ops packet_ops = { +static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 829fdbc4400b..63090be2315a 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1320,7 +1320,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return 0; default: - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } return 0; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 82fb07aa06a5..ba5283204837 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -25,7 +25,7 @@ #include <net/pkt_sched.h> -#define VERSION "1.1" +#define VERSION "1.2" /* Network Emulation Queuing algorithm. ==================================== @@ -65,11 +65,12 @@ struct netem_sched_data { u32 jitter; u32 duplicate; u32 reorder; + u32 corrupt; struct crndstate { unsigned long last; unsigned long rho; - } delay_cor, loss_cor, dup_cor, reorder_cor; + } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; struct disttable { u32 size; @@ -183,6 +184,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->duplicate = dupsave; } + /* + * Randomized packet corruption. + * Make copy if needed since we are modifying + * If packet is going to be hardware checksummed, then + * do it now in software before we mangle it. + */ + if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) + || (skb->ip_summed == CHECKSUM_HW + && skb_checksum_help(skb, 0))) { + sch->qstats.drops++; + return NET_XMIT_DROP; + } + + skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); + } + if (q->gap == 0 /* not doing reordering */ || q->counter < q->gap /* inside last reordering gap */ || q->reorder < get_crandom(&q->reorder_cor)) { @@ -382,6 +400,20 @@ static int get_reorder(struct Qdisc *sch, const struct rtattr *attr) return 0; } +static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_corrupt *r = RTA_DATA(attr); + + if (RTA_PAYLOAD(attr) != sizeof(*r)) + return -EINVAL; + + q->corrupt = r->probability; + init_crandom(&q->corrupt_cor, r->correlation); + return 0; +} + +/* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct rtattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); @@ -432,13 +464,19 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) if (ret) return ret; } + if (tb[TCA_NETEM_REORDER-1]) { ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]); if (ret) return ret; } - } + if (tb[TCA_NETEM_CORRUPT-1]) { + ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]); + if (ret) + return ret; + } + } return 0; } @@ -564,6 +602,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_qopt qopt; struct tc_netem_corr cor; struct tc_netem_reorder reorder; + struct tc_netem_corrupt corrupt; qopt.latency = q->latency; qopt.jitter = q->jitter; @@ -582,6 +621,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) reorder.correlation = q->reorder_cor.rho; RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); + corrupt.probability = q->corrupt; + corrupt.correlation = q->corrupt_cor.rho; + RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + rta->rta_len = skb->tail - b; return skb->len; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 6cf0342706b5..c4a2a8c4c339 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -22,6 +22,7 @@ #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> +#include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/inet.h> #include <linux/netdevice.h> diff --git a/net/sctp/associola.c b/net/sctp/associola.c index dec68a604773..9d05e13e92f6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -110,7 +110,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000; asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000) * 1000; - asoc->pmtu = 0; asoc->frag_point = 0; /* Set the association max_retrans and RTO values from the @@ -123,6 +122,25 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->overall_error_count = 0; + /* Initialize the association's heartbeat interval based on the + * sock configured value. + */ + asoc->hbinterval = msecs_to_jiffies(sp->hbinterval); + + /* Initialize path max retrans value. */ + asoc->pathmaxrxt = sp->pathmaxrxt; + + /* Initialize default path MTU. */ + asoc->pathmtu = sp->pathmtu; + + /* Set association default SACK delay */ + asoc->sackdelay = msecs_to_jiffies(sp->sackdelay); + + /* Set the association default flags controlling + * Heartbeat, SACK delay, and Path MTU Discovery. + */ + asoc->param_flags = sp->param_flags; + /* Initialize the maximum mumber of new data packets that can be sent * in a burst. */ @@ -144,8 +162,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; - asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = - SCTP_DEFAULT_TIMEOUT_SACK; + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; @@ -540,23 +557,46 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, sctp_transport_set_owner(peer, asoc); + /* Initialize the peer's heartbeat interval based on the + * association configured value. + */ + peer->hbinterval = asoc->hbinterval; + + /* Set the path max_retrans. */ + peer->pathmaxrxt = asoc->pathmaxrxt; + + /* Initialize the peer's SACK delay timeout based on the + * association configured value. + */ + peer->sackdelay = asoc->sackdelay; + + /* Enable/disable heartbeat, SACK delay, and path MTU discovery + * based on association setting. + */ + peer->param_flags = asoc->param_flags; + /* Initialize the pmtu of the transport. */ - sctp_transport_pmtu(peer); + if (peer->param_flags & SPP_PMTUD_ENABLE) + sctp_transport_pmtu(peer); + else if (asoc->pathmtu) + peer->pathmtu = asoc->pathmtu; + else + peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT; /* If this is the first transport addr on this association, * initialize the association PMTU to the peer's PMTU. * If not and the current association PMTU is higher than the new * peer's PMTU, reset the association PMTU to the new peer's PMTU. */ - if (asoc->pmtu) - asoc->pmtu = min_t(int, peer->pmtu, asoc->pmtu); + if (asoc->pathmtu) + asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu); else - asoc->pmtu = peer->pmtu; + asoc->pathmtu = peer->pathmtu; SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to " - "%d\n", asoc, asoc->pmtu); + "%d\n", asoc, asoc->pathmtu); - asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu); /* The asoc->peer.port might not be meaningful yet, but * initialize the packet structure anyway. @@ -574,7 +614,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, * (for example, implementations MAY use the size of the * receiver advertised window). */ - peer->cwnd = min(4*asoc->pmtu, max_t(__u32, 2*asoc->pmtu, 4380)); + peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); /* At this point, we may not have the receiver's advertised window, * so initialize ssthresh to the default value and it will be set @@ -585,17 +625,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, peer->partial_bytes_acked = 0; peer->flight_size = 0; - /* By default, enable heartbeat for peer address. */ - peer->hb_allowed = 1; - - /* Initialize the peer's heartbeat interval based on the - * sock configured value. - */ - peer->hb_interval = msecs_to_jiffies(sp->paddrparam.spp_hbinterval); - - /* Set the path max_retrans. */ - peer->max_retrans = sp->paddrparam.spp_pathmaxrxt; - /* Set the transport's RTO.initial value */ peer->rto = asoc->rto_initial; @@ -1155,18 +1184,18 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) /* Get the lowest pmtu of all the transports. */ list_for_each(pos, &asoc->peer.transport_addr_list) { t = list_entry(pos, struct sctp_transport, transports); - if (!pmtu || (t->pmtu < pmtu)) - pmtu = t->pmtu; + if (!pmtu || (t->pathmtu < pmtu)) + pmtu = t->pathmtu; } if (pmtu) { struct sctp_sock *sp = sctp_sk(asoc->base.sk); - asoc->pmtu = pmtu; + asoc->pathmtu = pmtu; asoc->frag_point = sctp_frag_point(sp, pmtu); } SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n", - __FUNCTION__, asoc, asoc->pmtu, asoc->frag_point); + __FUNCTION__, asoc, asoc->pathmtu, asoc->frag_point); } /* Should we send a SACK to update our peer? */ @@ -1179,7 +1208,7 @@ static inline int sctp_peer_needs_update(struct sctp_association *asoc) case SCTP_STATE_SHUTDOWN_SENT: if ((asoc->rwnd > asoc->a_rwnd) && ((asoc->rwnd - asoc->a_rwnd) >= - min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pmtu))) + min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pathmtu))) return 1; break; default: diff --git a/net/sctp/input.c b/net/sctp/input.c index b24ff2c1aef5..238f1bffa684 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -305,18 +305,36 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { - if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { - printk(KERN_WARNING "%s: Reported pmtu %d too low, " - "using default minimum of %d\n", __FUNCTION__, pmtu, - SCTP_DEFAULT_MINSEGMENT); - pmtu = SCTP_DEFAULT_MINSEGMENT; - } + if (sock_owned_by_user(sk) || !t || (t->pathmtu == pmtu)) + return; - if (!sock_owned_by_user(sk) && t && (t->pmtu != pmtu)) { - t->pmtu = pmtu; + if (t->param_flags & SPP_PMTUD_ENABLE) { + if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { + printk(KERN_WARNING "%s: Reported pmtu %d too low, " + "using default minimum of %d\n", + __FUNCTION__, pmtu, + SCTP_DEFAULT_MINSEGMENT); + /* Use default minimum segment size and disable + * pmtu discovery on this transport. + */ + t->pathmtu = SCTP_DEFAULT_MINSEGMENT; + t->param_flags = (t->param_flags & ~SPP_HB) | + SPP_PMTUD_DISABLE; + } else { + t->pathmtu = pmtu; + } + + /* Update association pmtu. */ sctp_assoc_sync_pmtu(asoc); - sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } + + /* Retransmit with the new pmtu setting. + * Normally, if PMTU discovery is disabled, an ICMP Fragmentation + * Needed will never be sent, but if a message was sent before + * PMTU discovery was disabled that was larger than the PMTU, it + * would not be fragmented, so it must be re-transmitted fragmented. + */ + sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } /* diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index fa3be2b8fb5f..15c05165c905 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -866,7 +866,7 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, return 2; } -static struct proto_ops inet6_seqpacket_ops = { +static const struct proto_ops inet6_seqpacket_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, diff --git a/net/sctp/output.c b/net/sctp/output.c index 931371633464..a40991ef72c9 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -234,8 +234,8 @@ sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet, goto finish; pmtu = ((packet->transport->asoc) ? - (packet->transport->asoc->pmtu) : - (packet->transport->pmtu)); + (packet->transport->asoc->pathmtu) : + (packet->transport->pathmtu)); too_big = (psize + chunk_len > pmtu); @@ -482,7 +482,9 @@ int sctp_packet_transmit(struct sctp_packet *packet) if (!dst || (dst->obsolete > 1)) { dst_release(dst); sctp_transport_route(tp, NULL, sctp_sk(sk)); - sctp_assoc_sync_pmtu(asoc); + if (asoc->param_flags & SPP_PMTUD_ENABLE) { + sctp_assoc_sync_pmtu(asoc); + } } nskb->dst = dst_clone(tp->dst); @@ -492,7 +494,10 @@ int sctp_packet_transmit(struct sctp_packet *packet) SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", nskb->len); - (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); + if (tp->param_flags & SPP_PMTUD_ENABLE) + (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); + else + (*tp->af_specific->sctp_xmit)(nskb, tp, 1); out: packet->size = packet->overhead; @@ -577,7 +582,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, * if ((flightsize + Max.Burst * MTU) < cwnd) * cwnd = flightsize + Max.Burst * MTU */ - max_burst_bytes = asoc->max_burst * asoc->pmtu; + max_burst_bytes = asoc->max_burst * asoc->pathmtu; if ((transport->flight_size + max_burst_bytes) < transport->cwnd) { transport->cwnd = transport->flight_size + max_burst_bytes; SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: " @@ -622,7 +627,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, * data will fit or delay in hopes of bundling a full * sized packet. */ - if (len < asoc->pmtu - packet->overhead) { + if (len < asoc->pathmtu - packet->overhead) { retval = SCTP_XMIT_NAGLE_DELAY; goto finish; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index f775d78aa59d..de693b43c8ea 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -54,6 +54,7 @@ #include <net/protocol.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/route.h> #include <net/sctp/sctp.h> #include <net/addrconf.h> #include <net/inet_common.h> @@ -829,7 +830,7 @@ static struct notifier_block sctp_inetaddr_notifier = { }; /* Socket operations. */ -static struct proto_ops inet_seqpacket_ops = { +static const struct proto_ops inet_seqpacket_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, /* Needs to be wrapped... */ diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 823947170a33..2d7d8a5db2ac 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -157,9 +157,12 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, { __u32 ctsn, max_tsn_seen; struct sctp_chunk *sack; + struct sctp_transport *trans = asoc->peer.last_data_from; int error = 0; - if (force) + if (force || + (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) || + (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE))) asoc->peer.sack_needed = 1; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); @@ -189,7 +192,22 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, if (!asoc->peer.sack_needed) { /* We will need a SACK for the next packet. */ asoc->peer.sack_needed = 1; - goto out; + + /* Set the SACK delay timeout based on the + * SACK delay for the last transport + * data was received from, or the default + * for the association. + */ + if (trans) + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + trans->sackdelay; + else + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + asoc->sackdelay; + + /* Restart the SACK timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } else { if (asoc->a_rwnd > asoc->rwnd) asoc->a_rwnd = asoc->rwnd; @@ -205,7 +223,7 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } -out: + return error; nomem: error = -ENOMEM; @@ -415,7 +433,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, asoc->overall_error_count++; if (transport->state != SCTP_INACTIVE && - (transport->error_count++ >= transport->max_retrans)) { + (transport->error_count++ >= transport->pathmaxrxt)) { SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p", " transport IP: port:%d failed.\n", asoc, diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 475bfb4972d9..557a7d90b92a 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -900,7 +900,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep, * HEARTBEAT is sent (see Section 8.3). */ - if (transport->hb_allowed) { + if (transport->param_flags & SPP_HB_ENABLE) { if (SCTP_DISPOSITION_NOMEM == sctp_sf_heartbeat(ep, asoc, type, arg, commands)) @@ -1051,7 +1051,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep, return SCTP_DISPOSITION_DISCARD; } - max_interval = link->hb_interval + link->rto; + max_interval = link->hbinterval + link->rto; /* Check if the timestamp looks valid. */ if (time_after(hbinfo->sent_at, jiffies) || @@ -2691,14 +2691,9 @@ sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep, * document allow. However, an SCTP transmitter MUST NOT be * more aggressive than the following algorithms allow. */ - if (chunk->end_of_packet) { + if (chunk->end_of_packet) sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); - } - return SCTP_DISPOSITION_CONSUME; discard_force: @@ -2721,13 +2716,9 @@ discard_force: return SCTP_DISPOSITION_DISCARD; discard_noforce: - if (chunk->end_of_packet) { + if (chunk->end_of_packet) sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); - } return SCTP_DISPOSITION_DISCARD; consume: return SCTP_DISPOSITION_CONSUME; @@ -3442,9 +3433,6 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep, * send another. */ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); return SCTP_DISPOSITION_CONSUME; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9df888e932c5..fc04d185fa33 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1941,107 +1941,379 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * address's parameters: * * struct sctp_paddrparams { - * sctp_assoc_t spp_assoc_id; - * struct sockaddr_storage spp_address; - * uint32_t spp_hbinterval; - * uint16_t spp_pathmaxrxt; - * }; - * - * spp_assoc_id - (UDP style socket) This is filled in the application, - * and identifies the association for this query. + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, - * in milliseconds. A value of 0, when modifying the - * parameter, specifies that the heartbeat on this - * address should be disabled. A value of UINT32_MAX - * (4294967295), when modifying the parameter, - * specifies that a heartbeat should be sent - * immediately to the peer address, and the current - * interval should remain unchanged. + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be - * considered unreachable. + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * speicifed address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results. + * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. Not also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results. */ +int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, + struct sctp_transport *trans, + struct sctp_association *asoc, + struct sctp_sock *sp, + int hb_change, + int pmtud_change, + int sackdelay_change) +{ + int error; + + if (params->spp_flags & SPP_HB_DEMAND && trans) { + error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); + if (error) + return error; + } + + if (params->spp_hbinterval) { + if (trans) { + trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + } else if (asoc) { + asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + } else { + sp->hbinterval = params->spp_hbinterval; + } + } + + if (hb_change) { + if (trans) { + trans->param_flags = + (trans->param_flags & ~SPP_HB) | hb_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_HB) | hb_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_HB) | hb_change; + } + } + + if (params->spp_pathmtu) { + if (trans) { + trans->pathmtu = params->spp_pathmtu; + sctp_assoc_sync_pmtu(asoc); + } else if (asoc) { + asoc->pathmtu = params->spp_pathmtu; + sctp_frag_point(sp, params->spp_pathmtu); + } else { + sp->pathmtu = params->spp_pathmtu; + } + } + + if (pmtud_change) { + if (trans) { + int update = (trans->param_flags & SPP_PMTUD_DISABLE) && + (params->spp_flags & SPP_PMTUD_ENABLE); + trans->param_flags = + (trans->param_flags & ~SPP_PMTUD) | pmtud_change; + if (update) { + sctp_transport_pmtu(trans); + sctp_assoc_sync_pmtu(asoc); + } + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_PMTUD) | pmtud_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_PMTUD) | pmtud_change; + } + } + + if (params->spp_sackdelay) { + if (trans) { + trans->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else { + sp->sackdelay = params->spp_sackdelay; + } + } + + if (sackdelay_change) { + if (trans) { + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } + } + + if (params->spp_pathmaxrxt) { + if (trans) { + trans->pathmaxrxt = params->spp_pathmaxrxt; + } else if (asoc) { + asoc->pathmaxrxt = params->spp_pathmaxrxt; + } else { + sp->pathmaxrxt = params->spp_pathmaxrxt; + } + } + + return 0; +} + static int sctp_setsockopt_peer_addr_params(struct sock *sk, char __user *optval, int optlen) { - struct sctp_paddrparams params; - struct sctp_transport *trans; + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); int error; + int hb_change, pmtud_change, sackdelay_change; if (optlen != sizeof(struct sctp_paddrparams)) - return -EINVAL; + return - EINVAL; + if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; - /* - * API 7. Socket Options (setting the default value for the endpoint) - * All options that support specific settings on an association by - * filling in either an association id variable or a sockaddr_storage - * SHOULD also support setting of the same value for the entire endpoint - * (i.e. future associations). To accomplish this the following logic is - * used when setting one of these options: - - * c) If neither the sockaddr_storage or association identification is - * set i.e. the sockaddr_storage is set to all 0's (INADDR_ANY) and - * the association identification is 0, the settings are a default - * and to be applied to the endpoint (all future associations). - */ + /* Validate flags and value parameters. */ + hb_change = params.spp_flags & SPP_HB; + pmtud_change = params.spp_flags & SPP_PMTUD; + sackdelay_change = params.spp_flags & SPP_SACKDELAY; + + if (hb_change == SPP_HB || + pmtud_change == SPP_PMTUD || + sackdelay_change == SPP_SACKDELAY || + params.spp_sackdelay > 500 || + (params.spp_pathmtu + && params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) + return -EINVAL; - /* update default value for endpoint (all future associations) */ - if (!params.spp_assoc_id && - sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { - /* Manual heartbeat on an endpoint is invalid. */ - if (0xffffffff == params.spp_hbinterval) + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) return -EINVAL; - else if (params.spp_hbinterval) - sctp_sk(sk)->paddrparam.spp_hbinterval = - params.spp_hbinterval; - if (params.spp_pathmaxrxt) - sctp_sk(sk)->paddrparam.spp_pathmaxrxt = - params.spp_pathmaxrxt; - return 0; } - trans = sctp_addr_id2transport(sk, ¶ms.spp_address, - params.spp_assoc_id); - if (!trans) + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - /* Applications can enable or disable heartbeats for any peer address - * of an association, modify an address's heartbeat interval, force a - * heartbeat to be sent immediately, and adjust the address's maximum - * number of retransmissions sent before an address is considered - * unreachable. - * - * The value of the heartbeat interval, in milliseconds. A value of - * UINT32_MAX (4294967295), when modifying the parameter, specifies - * that a heartbeat should be sent immediately to the peer address, - * and the current interval should remain unchanged. + /* Heartbeat demand can only be sent on a transport or + * association, but not a socket. */ - if (0xffffffff == params.spp_hbinterval) { - error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); - if (error) - return error; - } else { - /* The value of the heartbeat interval, in milliseconds. A value of 0, - * when modifying the parameter, specifies that the heartbeat on this - * address should be disabled. + if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc) + return -EINVAL; + + /* Process parameters. */ + error = sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); + + if (error) + return error; + + /* If changes are for association, also apply parameters to each + * transport. */ - if (params.spp_hbinterval) { - trans->hb_allowed = 1; - trans->hb_interval = - msecs_to_jiffies(params.spp_hbinterval); - } else - trans->hb_allowed = 0; + if (!trans && asoc) { + struct list_head *pos; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, + transports); + sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); + } } - /* spp_pathmaxrxt contains the maximum number of retransmissions - * before this address shall be considered unreachable. - */ - if (params.spp_pathmaxrxt) - trans->max_retrans = params.spp_pathmaxrxt; + return 0; +} + +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This options will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoints default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get effects the specified association. + * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id - This parameter, indicates which association the + * user is preforming an action upon. Note that if + * this field's value is zero then the endpoints + * default value is changed (effecting future + * associations only). + * + * assoc_value - This parameter contains the number of milliseconds + * that the user is requesting the delayed ACK timer + * be set to. Note that this value is defined in + * the standard to be between 200 and 500 milliseconds. + * + * Note: a value of zero will leave the value alone, + * but disable SACK delay. A non-zero value will also + * enable SACK delay. + */ +static int sctp_setsockopt_delayed_ack_time(struct sock *sk, + char __user *optval, int optlen) +{ + struct sctp_assoc_value params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen != sizeof(struct sctp_assoc_value)) + return - EINVAL; + + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + + /* Validate value parameter. */ + if (params.assoc_value > 500) + return -EINVAL; + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (params.assoc_value) { + if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params.assoc_value); + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + sp->sackdelay = params.assoc_value; + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } + } else { + if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } + } + + /* If change is for association, also apply to each transport. */ + if (asoc) { + struct list_head *pos; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, + transports); + if (params.assoc_value) { + trans->sackdelay = + msecs_to_jiffies(params.assoc_value); + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } + } + } + return 0; } @@ -2334,7 +2606,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl /* Update the frag_point of the existing associations. */ list_for_each(pos, &(sp->ep->asocs)) { asoc = list_entry(pos, struct sctp_association, asocs); - asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu); } return 0; @@ -2491,6 +2763,10 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen); break; + case SCTP_DELAYED_ACK_TIME: + retval = sctp_setsockopt_delayed_ack_time(sk, optval, optlen); + break; + case SCTP_INITMSG: retval = sctp_setsockopt_initmsg(sk, optval, optlen); break; @@ -2715,8 +2991,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk) /* Default Peer Address Parameters. These defaults can * be modified via SCTP_PEER_ADDR_PARAMS */ - sp->paddrparam.spp_hbinterval = jiffies_to_msecs(sctp_hb_interval); - sp->paddrparam.spp_pathmaxrxt = sctp_max_retrans_path; + sp->hbinterval = jiffies_to_msecs(sctp_hb_interval); + sp->pathmaxrxt = sctp_max_retrans_path; + sp->pathmtu = 0; // allow default discovery + sp->sackdelay = sctp_sack_timeout; + sp->param_flags = SPP_HB_ENABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; /* If enabled no SCTP message fragmentation will be performed. * Configure through SCTP_DISABLE_FRAGMENTS socket option. @@ -2865,7 +3146,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, status.sstat_primary.spinfo_cwnd = transport->cwnd; status.sstat_primary.spinfo_srtt = transport->srtt; status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); - status.sstat_primary.spinfo_mtu = transport->pmtu; + status.sstat_primary.spinfo_mtu = transport->pathmtu; if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN) status.sstat_primary.spinfo_state = SCTP_ACTIVE; @@ -2924,7 +3205,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, pinfo.spinfo_cwnd = transport->cwnd; pinfo.spinfo_srtt = transport->srtt; pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); - pinfo.spinfo_mtu = transport->pmtu; + pinfo.spinfo_mtu = transport->pathmtu; if (pinfo.spinfo_state == SCTP_UNKNOWN) pinfo.spinfo_state = SCTP_ACTIVE; @@ -3086,69 +3367,227 @@ out: * address's parameters: * * struct sctp_paddrparams { - * sctp_assoc_t spp_assoc_id; - * struct sockaddr_storage spp_address; - * uint32_t spp_hbinterval; - * uint16_t spp_pathmaxrxt; - * }; - * - * spp_assoc_id - (UDP style socket) This is filled in the application, - * and identifies the association for this query. + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, - * in milliseconds. A value of 0, when modifying the - * parameter, specifies that the heartbeat on this - * address should be disabled. A value of UINT32_MAX - * (4294967295), when modifying the parameter, - * specifies that a heartbeat should be sent - * immediately to the peer address, and the current - * interval should remain unchanged. + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be - * considered unreachable. + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * speicifed address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results. + * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. Not also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results. */ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen) { - struct sctp_paddrparams params; - struct sctp_transport *trans; + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); if (len != sizeof(struct sctp_paddrparams)) return -EINVAL; + if (copy_from_user(¶ms, optval, len)) return -EFAULT; - /* If no association id is specified retrieve the default value - * for the endpoint that will be used for all future associations + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. */ - if (!params.spp_assoc_id && - sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { - params.spp_hbinterval = sctp_sk(sk)->paddrparam.spp_hbinterval; - params.spp_pathmaxrxt = sctp_sk(sk)->paddrparam.spp_pathmaxrxt; - - goto done; + if (!sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) { + SCTP_DEBUG_PRINTK("Failed no transport\n"); + return -EINVAL; + } } - trans = sctp_addr_id2transport(sk, ¶ms.spp_address, - params.spp_assoc_id); - if (!trans) + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) { + SCTP_DEBUG_PRINTK("Failed no association\n"); return -EINVAL; + } - /* The value of the heartbeat interval, in milliseconds. A value of 0, - * when modifying the parameter, specifies that the heartbeat on this - * address should be disabled. - */ - if (!trans->hb_allowed) - params.spp_hbinterval = 0; - else - params.spp_hbinterval = jiffies_to_msecs(trans->hb_interval); + if (trans) { + /* Fetch transport values. */ + params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval); + params.spp_pathmtu = trans->pathmtu; + params.spp_pathmaxrxt = trans->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = trans->param_flags; + } else if (asoc) { + /* Fetch association values. */ + params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); + params.spp_pathmtu = asoc->pathmtu; + params.spp_pathmaxrxt = asoc->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = asoc->param_flags; + } else { + /* Fetch socket values. */ + params.spp_hbinterval = sp->hbinterval; + params.spp_pathmtu = sp->pathmtu; + params.spp_sackdelay = sp->sackdelay; + params.spp_pathmaxrxt = sp->pathmaxrxt; + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = sp->param_flags; + } - /* spp_pathmaxrxt contains the maximum number of retransmissions - * before this address shall be considered unreachable. - */ - params.spp_pathmaxrxt = trans->max_retrans; + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This options will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoints default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get effects the specified association. + * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id - This parameter, indicates which association the + * user is preforming an action upon. Note that if + * this field's value is zero then the endpoints + * default value is changed (effecting future + * associations only). + * + * assoc_value - This parameter contains the number of milliseconds + * that the user is requesting the delayed ACK timer + * be set to. Note that this value is defined in + * the standard to be between 200 and 500 milliseconds. + * + * Note: a value of zero will leave the value alone, + * but disable SACK delay. A non-zero value will also + * enable SACK delay. + */ +static int sctp_getsockopt_delayed_ack_time(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (len != sizeof(struct sctp_assoc_value)) + return - EINVAL; + + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + /* Fetch association values. */ + if (asoc->param_flags & SPP_SACKDELAY_ENABLE) + params.assoc_value = jiffies_to_msecs( + asoc->sackdelay); + else + params.assoc_value = 0; + } else { + /* Fetch socket values. */ + if (sp->param_flags & SPP_SACKDELAY_ENABLE) + params.assoc_value = sp->sackdelay; + else + params.assoc_value = 0; + } -done: if (copy_to_user(optval, ¶ms, len)) return -EFAULT; @@ -4015,6 +4454,10 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_peer_addr_params(sk, len, optval, optlen); break; + case SCTP_DELAYED_ACK_TIME: + retval = sctp_getsockopt_delayed_ack_time(sk, len, optval, + optlen); + break; case SCTP_INITMSG: retval = sctp_getsockopt_initmsg(sk, len, optval, optlen); break; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 268ddaf2dc0f..68d73e2dd155 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -86,10 +86,13 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, peer->init_sent_count = 0; peer->state = SCTP_ACTIVE; - peer->hb_allowed = 0; + peer->param_flags = SPP_HB_DISABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; + peer->hbinterval = 0; /* Initialize the default path max_retrans. */ - peer->max_retrans = sctp_max_retrans_path; + peer->pathmaxrxt = sctp_max_retrans_path; peer->error_count = 0; INIT_LIST_HEAD(&peer->transmitted); @@ -229,10 +232,10 @@ void sctp_transport_pmtu(struct sctp_transport *transport) dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL); if (dst) { - transport->pmtu = dst_mtu(dst); + transport->pathmtu = dst_mtu(dst); dst_release(dst); } else - transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } /* Caches the dst entry and source address for a transport's destination @@ -254,8 +257,11 @@ void sctp_transport_route(struct sctp_transport *transport, af->get_saddr(asoc, dst, daddr, &transport->saddr); transport->dst = dst; + if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) { + return; + } if (dst) { - transport->pmtu = dst_mtu(dst); + transport->pathmtu = dst_mtu(dst); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). @@ -264,7 +270,7 @@ void sctp_transport_route(struct sctp_transport *transport, opt->pf->af->to_sk_saddr(&transport->saddr, asoc->base.sk); } else - transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } /* Hold a reference to a transport. */ @@ -369,7 +375,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport, ssthresh = transport->ssthresh; pba = transport->partial_bytes_acked; - pmtu = transport->asoc->pmtu; + pmtu = transport->asoc->pathmtu; if (cwnd <= ssthresh) { /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less @@ -441,8 +447,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); - transport->cwnd = transport->asoc->pmtu; + 4*transport->asoc->pathmtu); + transport->cwnd = transport->asoc->pathmtu; break; case SCTP_LOWER_CWND_FAST_RTX: @@ -459,7 +465,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; break; @@ -479,7 +485,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, if ((jiffies - transport->last_time_ecne_reduced) > transport->rtt) { transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; transport->last_time_ecne_reduced = jiffies; } @@ -496,7 +502,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, */ if ((jiffies - transport->last_time_used) > transport->rto) transport->cwnd = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); break; }; @@ -511,7 +517,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, unsigned long sctp_transport_timeout(struct sctp_transport *t) { unsigned long timeout; - timeout = t->hb_interval + t->rto + sctp_jitter(t->rto); + timeout = t->hbinterval + t->rto + sctp_jitter(t->rto); timeout += jiffies; return timeout; } diff --git a/net/socket.c b/net/socket.c index 3145103cdf54..06fa217f58a9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -640,154 +640,150 @@ static void sock_aio_dtor(struct kiocb *iocb) kfree(iocb->private); } -/* - * Read data from a socket. ubuf is a user mode pointer. We make sure the user - * area ubuf...ubuf+size-1 is writable before asking the protocol. - */ - -static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf, - size_t size, loff_t pos) +static ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more) { - struct sock_iocb *x, siocb; struct socket *sock; int flags; - if (pos != 0) - return -ESPIPE; - if (size==0) /* Match SYS5 behaviour */ - return 0; + sock = file->private_data; - if (is_sync_kiocb(iocb)) - x = &siocb; - else { - x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); - if (!x) - return -ENOMEM; + flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + if (more) + flags |= MSG_MORE; + + return sock->ops->sendpage(sock, page, offset, size, flags); +} + +static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, + char __user *ubuf, size_t size, struct sock_iocb *siocb) +{ + if (!is_sync_kiocb(iocb)) { + siocb = kmalloc(sizeof(*siocb), GFP_KERNEL); + if (!siocb) + return NULL; iocb->ki_dtor = sock_aio_dtor; } - iocb->private = x; - x->kiocb = iocb; - sock = iocb->ki_filp->private_data; - x->async_msg.msg_name = NULL; - x->async_msg.msg_namelen = 0; - x->async_msg.msg_iov = &x->async_iov; - x->async_msg.msg_iovlen = 1; - x->async_msg.msg_control = NULL; - x->async_msg.msg_controllen = 0; - x->async_iov.iov_base = ubuf; - x->async_iov.iov_len = size; - flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + siocb->kiocb = iocb; + siocb->async_iov.iov_base = ubuf; + siocb->async_iov.iov_len = size; - return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags); + iocb->private = siocb; + return siocb; } +static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, + struct file *file, struct iovec *iov, unsigned long nr_segs) +{ + struct socket *sock = file->private_data; + size_t size = 0; + int i; -/* - * Write data to a socket. We verify that the user area ubuf..ubuf+size-1 - * is readable by the user process. - */ + for (i = 0 ; i < nr_segs ; i++) + size += iov[i].iov_len; -static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, - size_t size, loff_t pos) + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *) iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + + return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); +} + +static ssize_t sock_readv(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { - struct sock_iocb *x, siocb; - struct socket *sock; - + struct kiocb iocb; + struct sock_iocb siocb; + struct msghdr msg; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + + ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} + +static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf, + size_t count, loff_t pos) +{ + struct sock_iocb siocb, *x; + if (pos != 0) return -ESPIPE; - if(size==0) /* Match SYS5 behaviour */ + if (count == 0) /* Match SYS5 behaviour */ return 0; - if (is_sync_kiocb(iocb)) - x = &siocb; - else { - x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); - if (!x) - return -ENOMEM; - iocb->ki_dtor = sock_aio_dtor; - } - iocb->private = x; - x->kiocb = iocb; - sock = iocb->ki_filp->private_data; - - x->async_msg.msg_name = NULL; - x->async_msg.msg_namelen = 0; - x->async_msg.msg_iov = &x->async_iov; - x->async_msg.msg_iovlen = 1; - x->async_msg.msg_control = NULL; - x->async_msg.msg_controllen = 0; - x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; - if (sock->type == SOCK_SEQPACKET) - x->async_msg.msg_flags |= MSG_EOR; - x->async_iov.iov_base = (void __user *)ubuf; - x->async_iov.iov_len = size; - - return __sock_sendmsg(iocb, sock, &x->async_msg, size); + x = alloc_sock_iocb(iocb, ubuf, count, &siocb); + if (!x) + return -ENOMEM; + return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, + &x->async_iov, 1); } -static ssize_t sock_sendpage(struct file *file, struct page *page, - int offset, size_t size, loff_t *ppos, int more) +static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, + struct file *file, struct iovec *iov, unsigned long nr_segs) { - struct socket *sock; - int flags; + struct socket *sock = file->private_data; + size_t size = 0; + int i; - sock = file->private_data; + for (i = 0 ; i < nr_segs ; i++) + size += iov[i].iov_len; - flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; - if (more) - flags |= MSG_MORE; + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *) iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + if (sock->type == SOCK_SEQPACKET) + msg->msg_flags |= MSG_EOR; - return sock->ops->sendpage(sock, page, offset, size, flags); + return __sock_sendmsg(iocb, sock, msg, size); } -static int sock_readv_writev(int type, - struct file * file, const struct iovec * iov, - long count, size_t size) +static ssize_t sock_writev(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct msghdr msg; - struct socket *sock; + struct kiocb iocb; + struct sock_iocb siocb; + int ret; - sock = file->private_data; + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_iov = (struct iovec *) iov; - msg.msg_iovlen = count; - msg.msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} - /* read() does a VERIFY_WRITE */ - if (type == VERIFY_WRITE) - return sock_recvmsg(sock, &msg, size, msg.msg_flags); +static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, + size_t count, loff_t pos) +{ + struct sock_iocb siocb, *x; - if (sock->type == SOCK_SEQPACKET) - msg.msg_flags |= MSG_EOR; + if (pos != 0) + return -ESPIPE; + if (count == 0) /* Match SYS5 behaviour */ + return 0; - return sock_sendmsg(sock, &msg, size); -} + x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb); + if (!x) + return -ENOMEM; -static ssize_t sock_readv(struct file *file, const struct iovec *vector, - unsigned long count, loff_t *ppos) -{ - size_t tot_len = 0; - int i; - for (i = 0 ; i < count ; i++) - tot_len += vector[i].iov_len; - return sock_readv_writev(VERIFY_WRITE, - file, vector, count, tot_len); -} - -static ssize_t sock_writev(struct file *file, const struct iovec *vector, - unsigned long count, loff_t *ppos) -{ - size_t tot_len = 0; - int i; - for (i = 0 ; i < count ; i++) - tot_len += vector[i].iov_len; - return sock_readv_writev(VERIFY_READ, - file, vector, count, tot_len); + return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, + &x->async_iov, 1); } @@ -904,6 +900,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; default: err = sock->ops->ioctl(sock, cmd, arg); + + /* + * If this ioctl is unknown try to hand it down + * to the NIC driver. + */ + if (err == -ENOIOCTLCMD) + err = dev_ioctl(cmd, argp); break; } return err; @@ -2036,7 +2039,7 @@ int sock_unregister(int family) return 0; } -void __init sock_init(void) +static int __init sock_init(void) { /* * Initialize sock SLAB cache. @@ -2044,12 +2047,10 @@ void __init sock_init(void) sk_init(); -#ifdef SLAB_SKB /* * Initialize skbuff SLAB cache */ skb_init(); -#endif /* * Initialize the protocols module. @@ -2058,15 +2059,19 @@ void __init sock_init(void) init_inodecache(); register_filesystem(&sock_fs_type); sock_mnt = kern_mount(&sock_fs_type); - /* The real protocol initialization is performed when - * do_initcalls is run. + + /* The real protocol initialization is performed in later initcalls. */ #ifdef CONFIG_NETFILTER netfilter_init(); #endif + + return 0; } +core_initcall(sock_init); /* early initcall */ + #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c6a51911e71e..d68eba481291 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -758,7 +758,7 @@ svc_tcp_accept(struct svc_sock *svsk) struct svc_serv *serv = svsk->sk_server; struct socket *sock = svsk->sk_sock; struct socket *newsock; - struct proto_ops *ops; + const struct proto_ops *ops; struct svc_sock *newsvsk; int err, slen; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index acc73ba8bade..5f6ae79b8b16 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -121,7 +121,7 @@ int sysctl_unix_max_dgram_qlen = 10; struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; -DEFINE_RWLOCK(unix_table_lock); +DEFINE_SPINLOCK(unix_table_lock); static atomic_t unix_nr_socks = ATOMIC_INIT(0); #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -130,7 +130,7 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0); /* * SMP locking strategy: - * hash table is protected with rwlock unix_table_lock + * hash table is protected with spinlock unix_table_lock * each socket state is protected by separate rwlock. */ @@ -214,16 +214,16 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) static inline void unix_remove_socket(struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_remove_socket(sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_insert_socket(list, sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, @@ -250,11 +250,11 @@ static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname, { struct sock *s; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); s = __unix_find_socket_byname(sunname, len, type, hash); if (s) sock_hold(s); - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } @@ -263,7 +263,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) struct sock *s; struct hlist_node *node; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); sk_for_each(s, node, &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->dentry; @@ -276,7 +276,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) } s = NULL; found: - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } @@ -473,7 +473,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *, static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, struct msghdr *, size_t); -static struct proto_ops unix_stream_ops = { +static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -494,7 +494,7 @@ static struct proto_ops unix_stream_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_dgram_ops = { +static const struct proto_ops unix_dgram_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -515,7 +515,7 @@ static struct proto_ops unix_dgram_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_seqpacket_ops = { +static const struct proto_ops unix_seqpacket_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -564,7 +564,7 @@ static struct sock * unix_create1(struct socket *sock) u = unix_sk(sk); u->dentry = NULL; u->mnt = NULL; - rwlock_init(&u->lock); + spin_lock_init(&u->lock); atomic_set(&u->inflight, sock ? 0 : -1); init_MUTEX(&u->readsem); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); @@ -642,12 +642,12 @@ retry: addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0)); - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; if (__unix_find_socket_byname(addr->name, addr->len, sock->type, addr->hash)) { - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); /* Sanity yield. It is unusual case, but yet... */ if (!(ordernum&0xFF)) yield(); @@ -658,7 +658,7 @@ retry: __unix_remove_socket(sk); u->addr = addr; __unix_insert_socket(&unix_socket_table[addr->hash], sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); err = 0; out: up(&u->readsem); @@ -791,7 +791,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) addr->hash = UNIX_HASH_SIZE; } - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); if (!sunaddr->sun_path[0]) { err = -EADDRINUSE; @@ -814,7 +814,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) __unix_insert_socket(list, sk); out_unlock: - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); out_up: up(&u->readsem); out: @@ -1063,10 +1063,12 @@ restart: /* Set credentials */ sk->sk_peercred = other->sk_peercred; - sock_hold(newsk); - unix_peer(sk) = newsk; sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; + sock_hold(newsk); + + smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ + unix_peer(sk) = newsk; unix_state_wunlock(sk); @@ -1414,7 +1416,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, } else { sunaddr = NULL; err = -ENOTCONN; - other = unix_peer_get(sk); + other = unix_peer(sk); if (!other) goto out_err; } @@ -1476,7 +1478,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, other->sk_data_ready(other, size); sent+=size; } - sock_put(other); scm_destroy(siocb->scm); siocb->scm = NULL; @@ -1491,8 +1492,6 @@ pipe_err: send_sig(SIGPIPE,current,0); err = -EPIPE; out_err: - if (other) - sock_put(other); scm_destroy(siocb->scm); siocb->scm = NULL; return sent ? : err; @@ -1860,7 +1859,7 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } default: - err = dev_ioctl(cmd, (void __user *)arg); + err = -ENOIOCTLCMD; break; } return err; @@ -1917,7 +1916,7 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos) static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1); } @@ -1932,7 +1931,7 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void unix_seq_stop(struct seq_file *seq, void *v) { - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static int unix_seq_show(struct seq_file *seq, void *v) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6ffc64e1712d..411802bd4d37 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -182,7 +182,7 @@ void unix_gc(void) if (down_trylock(&unix_gc_sem)) return; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); forall_unix_sockets(i, s) { @@ -301,7 +301,7 @@ void unix_gc(void) } u->gc_tree = GC_ORPHAN; } - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); /* * Here we are. Hitlist is filled. Die. diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index 59fec59b2132..7a43ae4721ed 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c @@ -181,7 +181,7 @@ struct wanpipe_opt #endif static int sk_count; -extern struct proto_ops wanpipe_ops; +extern const struct proto_ops wanpipe_ops; static unsigned long find_free_critical; static void wanpipe_unlink_driver(struct sock *sk); @@ -1839,7 +1839,7 @@ static int wanpipe_ioctl(struct socket *sock, unsigned int cmd, unsigned long ar #endif default: - return dev_ioctl(cmd,(void __user *) arg); + return -ENOIOCTLCMD; } /*NOTREACHED*/ } @@ -2546,7 +2546,7 @@ static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr return 0; } -struct proto_ops wanpipe_ops = { +const struct proto_ops wanpipe_ops = { .family = PF_WANPIPE, .owner = THIS_MODULE, .release = wanpipe_release, diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 020d73cc8414..16459c7f54b2 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -64,7 +64,7 @@ int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); -static struct proto_ops x25_proto_ops; +static const struct proto_ops x25_proto_ops; static struct x25_address null_x25_address = {" "}; @@ -1378,7 +1378,7 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } default: - rc = dev_ioctl(cmd, argp); + rc = -ENOIOCTLCMD; break; } @@ -1391,7 +1391,7 @@ static struct net_proto_family x25_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d19e274b9c4a..64a447375fdb 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -10,7 +10,7 @@ * YOSHIFUJI Hideaki * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> Add the post_input processor - * + * */ #include <asm/bug.h> @@ -256,6 +256,7 @@ void __xfrm_policy_destroy(struct xfrm_policy *policy) if (del_timer(&policy->timer)) BUG(); + security_xfrm_policy_free(policy); kfree(policy); } EXPORT_SYMBOL(__xfrm_policy_destroy); @@ -350,7 +351,8 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) write_lock_bh(&xfrm_policy_lock); for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) { - if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) { + if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 && + xfrm_sec_ctx_match(pol->security, policy->security)) { if (excl) { write_unlock_bh(&xfrm_policy_lock); return -EEXIST; @@ -416,14 +418,15 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } EXPORT_SYMBOL(xfrm_policy_insert); -struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, - int delete) +struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx, int delete) { struct xfrm_policy *pol, **p; write_lock_bh(&xfrm_policy_lock); for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) { - if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) { + if ((memcmp(sel, &pol->selector, sizeof(*sel)) == 0) && + (xfrm_sec_ctx_match(ctx, pol->security))) { xfrm_pol_hold(pol); if (delete) *p = pol->next; @@ -438,7 +441,7 @@ struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, } return pol; } -EXPORT_SYMBOL(xfrm_policy_bysel); +EXPORT_SYMBOL(xfrm_policy_bysel_ctx); struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete) { @@ -519,7 +522,7 @@ EXPORT_SYMBOL(xfrm_policy_walk); /* Find policy to apply to this flow. */ -static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, +static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir, void **objp, atomic_t **obj_refp) { struct xfrm_policy *pol; @@ -533,9 +536,12 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, continue; match = xfrm_selector_match(sel, fl, family); + if (match) { - xfrm_pol_hold(pol); - break; + if (!security_xfrm_policy_lookup(pol, sk_sid, dir)) { + xfrm_pol_hold(pol); + break; + } } } read_unlock_bh(&xfrm_policy_lock); @@ -543,15 +549,37 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, *obj_refp = &pol->refcnt; } -static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl) +static inline int policy_to_flow_dir(int dir) +{ + if (XFRM_POLICY_IN == FLOW_DIR_IN && + XFRM_POLICY_OUT == FLOW_DIR_OUT && + XFRM_POLICY_FWD == FLOW_DIR_FWD) + return dir; + switch (dir) { + default: + case XFRM_POLICY_IN: + return FLOW_DIR_IN; + case XFRM_POLICY_OUT: + return FLOW_DIR_OUT; + case XFRM_POLICY_FWD: + return FLOW_DIR_FWD; + }; +} + +static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl, u32 sk_sid) { struct xfrm_policy *pol; read_lock_bh(&xfrm_policy_lock); if ((pol = sk->sk_policy[dir]) != NULL) { - int match = xfrm_selector_match(&pol->selector, fl, + int match = xfrm_selector_match(&pol->selector, fl, sk->sk_family); + int err = 0; + if (match) + err = security_xfrm_policy_lookup(pol, sk_sid, policy_to_flow_dir(dir)); + + if (match && !err) xfrm_pol_hold(pol); else pol = NULL; @@ -624,6 +652,10 @@ static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir) if (newp) { newp->selector = old->selector; + if (security_xfrm_policy_clone(old, newp)) { + kfree(newp); + return NULL; /* ENOMEM */ + } newp->lft = old->lft; newp->curlft = old->curlft; newp->action = old->action; @@ -735,22 +767,6 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, return err; } -static inline int policy_to_flow_dir(int dir) -{ - if (XFRM_POLICY_IN == FLOW_DIR_IN && - XFRM_POLICY_OUT == FLOW_DIR_OUT && - XFRM_POLICY_FWD == FLOW_DIR_FWD) - return dir; - switch (dir) { - default: - case XFRM_POLICY_IN: - return FLOW_DIR_IN; - case XFRM_POLICY_OUT: - return FLOW_DIR_OUT; - case XFRM_POLICY_FWD: - return FLOW_DIR_FWD; - }; -} static int stale_bundle(struct dst_entry *dst); @@ -769,19 +785,20 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, int err; u32 genid; u16 family = dst_orig->ops->family; + u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT); + u32 sk_sid = security_sk_sid(sk, fl, dir); restart: genid = atomic_read(&flow_cache_genid); policy = NULL; if (sk && sk->sk_policy[1]) - policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); + policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, sk_sid); if (!policy) { /* To accelerate a bit... */ if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT]) return 0; - policy = flow_cache_lookup(fl, family, - policy_to_flow_dir(XFRM_POLICY_OUT), + policy = flow_cache_lookup(fl, sk_sid, family, dir, xfrm_policy_lookup); } @@ -962,16 +979,20 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, { struct xfrm_policy *pol; struct flowi fl; + u8 fl_dir = policy_to_flow_dir(dir); + u32 sk_sid; if (_decode_session(skb, &fl, family) < 0) return 0; + sk_sid = security_sk_sid(sk, &fl, fl_dir); + /* First, check used SA against their selectors. */ if (skb->sp) { int i; for (i=skb->sp->len-1; i>=0; i--) { - struct sec_decap_state *xvec = &(skb->sp->x[i]); + struct sec_decap_state *xvec = &(skb->sp->x[i]); if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family)) return 0; @@ -986,11 +1007,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, pol = NULL; if (sk && sk->sk_policy[dir]) - pol = xfrm_sk_policy_lookup(sk, dir, &fl); + pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid); if (!pol) - pol = flow_cache_lookup(&fl, family, - policy_to_flow_dir(dir), + pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir, xfrm_policy_lookup); if (!pol) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 479effc97666..e12d0be5f976 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -10,7 +10,7 @@ * Split up af-specific functions * Derek Atkins <derek@ihtfp.com> * Add UDP Encapsulation - * + * */ #include <linux/workqueue.h> @@ -70,6 +70,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) x->type->destructor(x); xfrm_put_type(x->type); } + security_xfrm_state_free(x); kfree(x); } @@ -343,7 +344,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, selector. */ if (x->km.state == XFRM_STATE_VALID) { - if (!xfrm_selector_match(&x->sel, fl, family)) + if (!xfrm_selector_match(&x->sel, fl, family) || + !xfrm_sec_ctx_match(pol->security, x->security)) continue; if (!best || best->km.dying > x->km.dying || @@ -354,7 +356,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, acquire_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { - if (xfrm_selector_match(&x->sel, fl, family)) + if (xfrm_selector_match(&x->sel, fl, family) && + xfrm_sec_ctx_match(pol->security, x->security)) error = -ESRCH; } } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0cdd9a07e043..92e2b804c606 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -7,7 +7,7 @@ * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support - * + * */ #include <linux/module.h> @@ -88,6 +88,34 @@ static int verify_encap_tmpl(struct rtattr **xfrma) return 0; } + +static inline int verify_sec_ctx_len(struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_SEC_CTX - 1]; + struct xfrm_user_sec_ctx *uctx; + int len = 0; + + if (!rt) + return 0; + + if (rt->rta_len < sizeof(*uctx)) + return -EINVAL; + + uctx = RTA_DATA(rt); + + if (uctx->ctx_len > PAGE_SIZE) + return -EINVAL; + + len += sizeof(struct xfrm_user_sec_ctx); + len += uctx->ctx_len; + + if (uctx->len != len) + return -EINVAL; + + return 0; +} + + static int verify_newsa_info(struct xfrm_usersa_info *p, struct rtattr **xfrma) { @@ -145,6 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, goto out; if ((err = verify_encap_tmpl(xfrma))) goto out; + if ((err = verify_sec_ctx_len(xfrma))) + goto out; err = -EINVAL; switch (p->mode) { @@ -209,6 +239,30 @@ static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_a return 0; } + +static inline int xfrm_user_sec_ctx_size(struct xfrm_policy *xp) +{ + struct xfrm_sec_ctx *xfrm_ctx = xp->security; + int len = 0; + + if (xfrm_ctx) { + len += sizeof(struct xfrm_user_sec_ctx); + len += xfrm_ctx->ctx_len; + } + return len; +} + +static int attach_sec_ctx(struct xfrm_state *x, struct rtattr *u_arg) +{ + struct xfrm_user_sec_ctx *uctx; + + if (!u_arg) + return 0; + + uctx = RTA_DATA(u_arg); + return security_xfrm_state_alloc(x, uctx); +} + static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) { memcpy(&x->id, &p->id, sizeof(x->id)); @@ -253,6 +307,9 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p, if (err) goto error; + if ((err = attach_sec_ctx(x, xfrma[XFRMA_SEC_CTX-1]))) + goto error; + x->km.seq = p->seq; return x; @@ -272,11 +329,11 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) int err; struct km_event c; - err = verify_newsa_info(p, (struct rtattr **) xfrma); + err = verify_newsa_info(p, (struct rtattr **)xfrma); if (err) return err; - x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err); + x = xfrm_state_construct(p, (struct rtattr **)xfrma, &err); if (!x) return err; @@ -390,6 +447,19 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) if (x->encap) RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap); + if (x->security) { + int ctx_size = sizeof(struct xfrm_sec_ctx) + + x->security->ctx_len; + struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size); + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + uctx->exttype = XFRMA_SEC_CTX; + uctx->len = ctx_size; + uctx->ctx_doi = x->security->ctx_doi; + uctx->ctx_alg = x->security->ctx_alg; + uctx->ctx_len = x->security->ctx_len; + memcpy(uctx + 1, x->security->ctx_str, x->security->ctx_len); + } nlh->nlmsg_len = skb->tail - b; out: sp->this_idx++; @@ -603,6 +673,18 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) return verify_policy_dir(p->dir); } +static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_SEC_CTX-1]; + struct xfrm_user_sec_ctx *uctx; + + if (!rt) + return 0; + + uctx = RTA_DATA(rt); + return security_xfrm_policy_alloc(pol, uctx); +} + static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, int nr) { @@ -681,7 +763,10 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p, } copy_from_user_policy(xp, p); - err = copy_from_user_tmpl(xp, xfrma); + + if (!(err = copy_from_user_tmpl(xp, xfrma))) + err = copy_from_user_sec_ctx(xp, xfrma); + if (err) { *errp = err; kfree(xp); @@ -702,8 +787,11 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr err = verify_newpolicy_info(p); if (err) return err; + err = verify_sec_ctx_len((struct rtattr **)xfrma); + if (err) + return err; - xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err); + xp = xfrm_policy_construct(p, (struct rtattr **)xfrma, &err); if (!xp) return err; @@ -761,6 +849,27 @@ rtattr_failure: return -1; } +static int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb) +{ + if (xp->security) { + int ctx_size = sizeof(struct xfrm_sec_ctx) + + xp->security->ctx_len; + struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size); + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + uctx->exttype = XFRMA_SEC_CTX; + uctx->len = ctx_size; + uctx->ctx_doi = xp->security->ctx_doi; + uctx->ctx_alg = xp->security->ctx_alg; + uctx->ctx_len = xp->security->ctx_len; + memcpy(uctx + 1, xp->security->ctx_str, xp->security->ctx_len); + } + return 0; + + rtattr_failure: + return -1; +} + static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr) { struct xfrm_dump_info *sp = ptr; @@ -782,6 +891,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr copy_to_user_policy(xp, p, dir); if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; nlh->nlmsg_len = skb->tail - b; out: @@ -852,8 +963,25 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr if (p->index) xp = xfrm_policy_byid(p->dir, p->index, delete); - else - xp = xfrm_policy_bysel(p->dir, &p->sel, delete); + else { + struct rtattr **rtattrs = (struct rtattr **)xfrma; + struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1]; + struct xfrm_policy tmp; + + err = verify_sec_ctx_len(rtattrs); + if (err) + return err; + + memset(&tmp, 0, sizeof(struct xfrm_policy)); + if (rt) { + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + if ((err = security_xfrm_policy_alloc(&tmp, uctx))) + return err; + } + xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, delete); + security_xfrm_policy_free(&tmp); + } if (xp == NULL) return -ENOENT; @@ -1224,6 +1352,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; nlh->nlmsg_len = skb->tail - b; return skb->len; @@ -1241,6 +1371,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire)); + len += RTA_SPACE(xfrm_user_sec_ctx_size(xp)); skb = alloc_skb(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; @@ -1324,6 +1455,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, copy_to_user_policy(xp, &upe->pol, dir); if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; upe->hard = !!hard; nlh->nlmsg_len = skb->tail - b; @@ -1341,6 +1474,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire)); + len += RTA_SPACE(xfrm_user_sec_ctx_size(xp)); skb = alloc_skb(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; |