diff options
Diffstat (limited to 'net/openvswitch')
-rw-r--r-- | net/openvswitch/Kconfig | 1 | ||||
-rw-r--r-- | net/openvswitch/Makefile | 1 | ||||
-rw-r--r-- | net/openvswitch/actions.c | 126 | ||||
-rw-r--r-- | net/openvswitch/conntrack.c | 12 | ||||
-rw-r--r-- | net/openvswitch/conntrack.h | 7 | ||||
-rw-r--r-- | net/openvswitch/datapath.c | 90 | ||||
-rw-r--r-- | net/openvswitch/datapath.h | 39 | ||||
-rw-r--r-- | net/openvswitch/dp_notify.c | 4 | ||||
-rw-r--r-- | net/openvswitch/flow.c | 51 | ||||
-rw-r--r-- | net/openvswitch/flow.h | 7 | ||||
-rw-r--r-- | net/openvswitch/flow_netlink.c | 405 | ||||
-rw-r--r-- | net/openvswitch/flow_netlink.h | 5 | ||||
-rw-r--r-- | net/openvswitch/meter.c | 597 | ||||
-rw-r--r-- | net/openvswitch/meter.h | 54 | ||||
-rw-r--r-- | net/openvswitch/vport-netdev.c | 3 |
15 files changed, 1350 insertions, 52 deletions
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index ce947292ae77..2650205cdaf9 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -14,6 +14,7 @@ config OPENVSWITCH select MPLS select NET_MPLS_GSO select DST_CACHE + select NET_NSH ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 299f4476cf44..41109c326f3a 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -12,6 +12,7 @@ openvswitch-y := \ flow.o \ flow_netlink.o \ flow_table.o \ + meter.o \ vport.o \ vport-internal_dev.o \ vport-netdev.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index a54a556fcdb5..30a5df27116e 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -43,6 +43,7 @@ #include "flow.h" #include "conntrack.h" #include "vport.h" +#include "flow_netlink.h" struct deferred_action { struct sk_buff *skb; @@ -380,6 +381,38 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, return 0; } +static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key, + const struct nshhdr *nh) +{ + int err; + + err = nsh_push(skb, nh); + if (err) + return err; + + /* safe right before invalidate_flow_key */ + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} + +static int pop_nsh(struct sk_buff *skb, struct sw_flow_key *key) +{ + int err; + + err = nsh_pop(skb); + if (err) + return err; + + /* safe right before invalidate_flow_key */ + if (skb->protocol == htons(ETH_P_TEB)) + key->mac_proto = MAC_PROTO_ETHERNET; + else + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} + static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, __be32 addr, __be32 new_addr) { @@ -602,6 +635,69 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } +static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct nlattr *a) +{ + struct nshhdr *nh; + size_t length; + int err; + u8 flags; + u8 ttl; + int i; + + struct ovs_key_nsh key; + struct ovs_key_nsh mask; + + err = nsh_key_from_nlattr(a, &key, &mask); + if (err) + return err; + + /* Make sure the NSH base header is there */ + if (!pskb_may_pull(skb, skb_network_offset(skb) + NSH_BASE_HDR_LEN)) + return -ENOMEM; + + nh = nsh_hdr(skb); + length = nsh_hdr_len(nh); + + /* Make sure the whole NSH header is there */ + err = skb_ensure_writable(skb, skb_network_offset(skb) + + length); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + skb_postpull_rcsum(skb, nh, length); + flags = nsh_get_flags(nh); + flags = OVS_MASKED(flags, key.base.flags, mask.base.flags); + flow_key->nsh.base.flags = flags; + ttl = nsh_get_ttl(nh); + ttl = OVS_MASKED(ttl, key.base.ttl, mask.base.ttl); + flow_key->nsh.base.ttl = ttl; + nsh_set_flags_and_ttl(nh, flags, ttl); + nh->path_hdr = OVS_MASKED(nh->path_hdr, key.base.path_hdr, + mask.base.path_hdr); + flow_key->nsh.base.path_hdr = nh->path_hdr; + switch (nh->mdtype) { + case NSH_M_TYPE1: + for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) { + nh->md1.context[i] = + OVS_MASKED(nh->md1.context[i], key.context[i], + mask.context[i]); + } + memcpy(flow_key->nsh.context, nh->md1.context, + sizeof(nh->md1.context)); + break; + case NSH_M_TYPE2: + memset(flow_key->nsh.context, 0, + sizeof(flow_key->nsh.context)); + break; + default: + return -EINVAL; + } + skb_postpush_rcsum(skb, nh, length); + return 0; +} + /* Must follow skb_ensure_writable() since that can move the skb data. */ static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) @@ -1024,6 +1120,10 @@ static int execute_masked_set_action(struct sk_buff *skb, get_mask(a, struct ovs_key_ethernet *)); break; + case OVS_KEY_ATTR_NSH: + err = set_nsh(skb, flow_key, a); + break; + case OVS_KEY_ATTR_IPV4: err = set_ipv4(skb, flow_key, nla_data(a), get_mask(a, struct ovs_key_ipv4 *)); @@ -1203,6 +1303,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return err == -EINPROGRESS ? 0 : err; break; + case OVS_ACTION_ATTR_CT_CLEAR: + err = ovs_ct_clear(skb, key); + break; + case OVS_ACTION_ATTR_PUSH_ETH: err = push_eth(skb, key, nla_data(a)); break; @@ -1210,6 +1314,28 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_POP_ETH: err = pop_eth(skb, key); break; + + case OVS_ACTION_ATTR_PUSH_NSH: { + u8 buffer[NSH_HDR_MAX_LEN]; + struct nshhdr *nh = (struct nshhdr *)buffer; + + err = nsh_hdr_from_nlattr(nla_data(a), nh, + NSH_HDR_MAX_LEN); + if (unlikely(err)) + break; + err = push_nsh(skb, key, nh); + break; + } + + case OVS_ACTION_ATTR_POP_NSH: + err = pop_nsh(skb, key); + break; + + case OVS_ACTION_ATTR_METER: + if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) { + consume_skb(skb); + return 0; + } } if (unlikely(err)) { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index d558e882ca0c..b27c5c6d9cab 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -752,6 +752,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, } } /* Non-ICMP, fall thru to initialize if needed. */ + /* fall through */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. @@ -1129,6 +1130,17 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, return err; } +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) +{ + if (skb_nfct(skb)) { + nf_conntrack_put(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + ovs_ct_fill_key(skb, key); + } + + return 0; +} + static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, const struct sw_flow_key *key, bool log) { diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index bc7efd1867ab..399dfdd2c4f9 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -30,6 +30,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, const struct ovs_conntrack_info *); +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key); void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); int ovs_ct_put_key(const struct sw_flow_key *swkey, @@ -73,6 +74,12 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, return -ENOTSUPP; } +static inline int ovs_ct_clear(struct sk_buff *skb, + struct sw_flow_key *key) +{ + return -ENOTSUPP; +} + static inline void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index c3aec6227c91..0dab33fb9844 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -55,6 +55,7 @@ #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" +#include "meter.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -142,35 +143,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *, uint32_t cutlen); -/* Must be called with rcu_read_lock. */ -static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) -{ - struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); - - if (dev) { - struct vport *vport = ovs_internal_dev_get_vport(dev); - if (vport) - return vport->dp; - } - - return NULL; -} - -/* The caller must hold either ovs_mutex or rcu_read_lock to keep the - * returned dp pointer valid. - */ -static inline struct datapath *get_dp(struct net *net, int dp_ifindex) -{ - struct datapath *dp; - - WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); - rcu_read_lock(); - dp = get_dp_rcu(net, dp_ifindex); - rcu_read_unlock(); - - return dp; -} - /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { @@ -203,6 +175,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); kfree(dp->ports); + ovs_meters_exit(dp); kfree(dp); } @@ -1601,6 +1574,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) INIT_HLIST_HEAD(&dp->ports[i]); + err = ovs_meters_init(dp); + if (err) + goto err_destroy_ports_array; + /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); parms.type = OVS_VPORT_TYPE_INTERNAL; @@ -1629,7 +1606,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } - goto err_destroy_ports_array; + goto err_destroy_meters; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, @@ -1644,8 +1621,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -err_destroy_ports_array: +err_destroy_meters: ovs_unlock(); + ovs_meters_exit(dp); +err_destroy_ports_array: kfree(dp->ports); err_destroy_percpu: free_percpu(dp->stats_percpu); @@ -1848,7 +1827,8 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = { /* Called with ovs_mutex or RCU read lock. */ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, - u32 portid, u32 seq, u32 flags, u8 cmd) + struct net *net, u32 portid, u32 seq, + u32 flags, u8 cmd) { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; @@ -1864,9 +1844,17 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || nla_put_string(skb, OVS_VPORT_ATTR_NAME, - ovs_vport_name(vport))) + ovs_vport_name(vport)) || + nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; + if (!net_eq(net, dev_net(vport->dev))) { + int id = peernet2id_alloc(net, dev_net(vport->dev)); + + if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) + goto nla_put_failure; + } + ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), &vport_stats, @@ -1896,8 +1884,8 @@ static struct sk_buff *ovs_vport_cmd_alloc_info(void) } /* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ -struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, - u32 seq, u8 cmd) +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, + u32 portid, u32 seq, u8 cmd) { struct sk_buff *skb; int retval; @@ -1906,7 +1894,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, if (!skb) return ERR_PTR(-ENOMEM); - retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd); + retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd); BUG_ON(retval < 0); return skb; @@ -1920,6 +1908,8 @@ static struct vport *lookup_vport(struct net *net, struct datapath *dp; struct vport *vport; + if (a[OVS_VPORT_ATTR_IFINDEX]) + return ERR_PTR(-EOPNOTSUPP); if (a[OVS_VPORT_ATTR_NAME]) { vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); if (!vport) @@ -1944,6 +1934,7 @@ static struct vport *lookup_vport(struct net *net, return vport; } else return ERR_PTR(-EINVAL); + } /* Called with ovs_mutex */ @@ -1983,6 +1974,8 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) return -EINVAL; + if (a[OVS_VPORT_ATTR_IFINDEX]) + return -EOPNOTSUPP; port_no = a[OVS_VPORT_ATTR_PORT_NO] ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; @@ -2032,8 +2025,9 @@ restart: goto exit_unlock_free; } - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_NEW); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_NEW); if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom) update_headroom(dp); @@ -2090,8 +2084,9 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; } - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_NEW); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_NEW); BUG_ON(err < 0); ovs_unlock(); @@ -2128,8 +2123,9 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; } - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_DEL); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_DEL); BUG_ON(err < 0); /* the vport deletion may trigger dp headroom update */ @@ -2169,8 +2165,9 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_NEW); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_NEW); BUG_ON(err < 0); rcu_read_unlock(); @@ -2202,6 +2199,7 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { if (j >= skip && ovs_vport_cmd_fill_info(vport, skb, + sock_net(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, @@ -2228,6 +2226,8 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, + [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, }; static const struct genl_ops dp_vport_genl_ops[] = { @@ -2273,6 +2273,7 @@ static struct genl_family * const dp_genl_families[] = { &dp_vport_genl_family, &dp_flow_genl_family, &dp_packet_genl_family, + &dp_meter_genl_family, }; static void dp_unregister_genl(int n_families) @@ -2453,3 +2454,4 @@ MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 480600649d0b..523d65526766 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -30,6 +30,8 @@ #include "conntrack.h" #include "flow.h" #include "flow_table.h" +#include "meter.h" +#include "vport-internal_dev.h" #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 @@ -91,6 +93,9 @@ struct datapath { u32 user_features; u32 max_headroom; + + /* Switch meters. */ + struct hlist_head *meters; }; /** @@ -190,6 +195,36 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n return ovs_lookup_vport(dp, port_no); } +/* Must be called with rcu_read_lock. */ +static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) +{ + struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); + + if (dev) { + struct vport *vport = ovs_internal_dev_get_vport(dev); + + if (vport) + return vport->dp; + } + + return NULL; +} + +/* The caller must hold either ovs_mutex or rcu_read_lock to keep the + * returned dp pointer valid. + */ +static inline struct datapath *get_dp(struct net *net, int dp_ifindex) +{ + struct datapath *dp; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + rcu_read_lock(); + dp = get_dp_rcu(net, dp_ifindex); + rcu_read_unlock(); + + return dp; +} + extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; @@ -200,8 +235,8 @@ int ovs_dp_upcall(struct datapath *, struct sk_buff *, uint32_t cutlen); const char *ovs_dp_name(const struct datapath *dp); -struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, - u8 cmd); +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, + u32 portid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *, struct sw_flow_key *); diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 653d073bae45..f3ee2f2825c0 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -30,8 +30,8 @@ static void dp_detach_port_notify(struct vport *vport) struct datapath *dp; dp = vport->dp; - notify = ovs_vport_cmd_build_info(vport, 0, 0, - OVS_VPORT_CMD_DEL); + notify = ovs_vport_cmd_build_info(vport, ovs_dp_get_net(dp), + 0, 0, OVS_VPORT_CMD_DEL); ovs_dp_detach_port(vport); if (IS_ERR(notify)) { genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8c94cef25a72..864ddb1e3642 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -46,6 +46,7 @@ #include <net/ipv6.h> #include <net/mpls.h> #include <net/ndisc.h> +#include <net/nsh.h> #include "conntrack.h" #include "datapath.h" @@ -490,6 +491,52 @@ invalid: return 0; } +static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key) +{ + struct nshhdr *nh; + unsigned int nh_ofs = skb_network_offset(skb); + u8 version, length; + int err; + + err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + version = nsh_get_ver(nh); + length = nsh_hdr_len(nh); + + if (version != 0) + return -EINVAL; + + err = check_header(skb, nh_ofs + length); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + key->nsh.base.flags = nsh_get_flags(nh); + key->nsh.base.ttl = nsh_get_ttl(nh); + key->nsh.base.mdtype = nh->mdtype; + key->nsh.base.np = nh->np; + key->nsh.base.path_hdr = nh->path_hdr; + switch (key->nsh.base.mdtype) { + case NSH_M_TYPE1: + if (length != NSH_M_TYPE1_LEN) + return -EINVAL; + memcpy(key->nsh.context, nh->md1.context, + sizeof(nh->md1)); + break; + case NSH_M_TYPE2: + memset(key->nsh.context, 0, + sizeof(nh->md1)); + break; + default: + return -EINVAL; + } + + return 0; +} + /** * key_extract - extracts a flow key from an Ethernet frame. * @skb: sk_buff that contains the frame, with skb->data pointing to the @@ -735,6 +782,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->tp, 0, sizeof(key->tp)); } } + } else if (key->eth.type == htons(ETH_P_NSH)) { + error = parse_nsh(skb, key); + if (error) + return error; } return 0; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 1875bba4f865..c670dd24b8b7 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -35,6 +35,7 @@ #include <net/inet_ecn.h> #include <net/ip_tunnels.h> #include <net/dst_metadata.h> +#include <net/nsh.h> struct sk_buff; @@ -66,6 +67,11 @@ struct vlan_head { (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) +struct ovs_key_nsh { + struct ovs_nsh_key_base base; + __be32 context[NSH_MD1_CONTEXT_SIZE]; +}; + struct sw_flow_key { u8 tun_opts[IP_TUNNEL_OPTS_MAX]; u8 tun_opts_len; @@ -143,6 +149,7 @@ struct sw_flow_key { } nd; }; } ipv6; + struct ovs_key_nsh nsh; /* network service header */ }; struct { /* Connection tracking fields not packed above. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index e8eb427ce6d1..dc424798ba6f 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -48,6 +48,8 @@ #include <net/ndisc.h> #include <net/mpls.h> #include <net/vxlan.h> +#include <net/tun_proto.h> +#include <net/erspan.h> #include "flow_netlink.h" @@ -75,16 +77,20 @@ static bool actions_may_change_flow(const struct nlattr *actions) break; case OVS_ACTION_ATTR_CT: + case OVS_ACTION_ATTR_CT_CLEAR: case OVS_ACTION_ATTR_HASH: case OVS_ACTION_ATTR_POP_ETH: case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_POP_NSH: case OVS_ACTION_ATTR_POP_VLAN: case OVS_ACTION_ATTR_PUSH_ETH: case OVS_ACTION_ATTR_PUSH_MPLS: + case OVS_ACTION_ATTR_PUSH_NSH: case OVS_ACTION_ATTR_PUSH_VLAN: case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SET_MASKED: + case OVS_ACTION_ATTR_METER: default: return true; } @@ -173,7 +179,8 @@ static bool match_validate(const struct sw_flow_match *match, | (1 << OVS_KEY_ATTR_ICMPV6) | (1 << OVS_KEY_ATTR_ARP) | (1 << OVS_KEY_ATTR_ND) - | (1 << OVS_KEY_ATTR_MPLS)); + | (1 << OVS_KEY_ATTR_MPLS) + | (1 << OVS_KEY_ATTR_NSH)); /* Always allowed mask fields. */ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) @@ -282,6 +289,14 @@ static bool match_validate(const struct sw_flow_match *match, } } + if (match->key->eth.type == htons(ETH_P_NSH)) { + key_expected |= 1 << OVS_KEY_ATTR_NSH; + if (match->mask && + match->mask->key.eth.type == htons(0xffff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_NSH; + } + } + if ((key_attrs & key_expected) != key_expected) { /* Key attributes check failed. */ OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)", @@ -319,7 +334,21 @@ size_t ovs_tun_key_attr_size(void) * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. */ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ - + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ + + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_DST */ + + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ +} + +static size_t ovs_nsh_key_attr_size(void) +{ + /* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider + * updating this function. + */ + return nla_total_size(NSH_BASE_HDR_LEN) /* OVS_NSH_KEY_ATTR_BASE */ + /* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are + * mutually exclusive, so the bigger one can cover + * the small one. + */ + + nla_total_size(NSH_CTX_HDRS_MAX_LEN); } size_t ovs_key_attr_size(void) @@ -327,7 +356,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -341,6 +370,8 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */ + + nla_total_size(0) /* OVS_KEY_ATTR_NSH */ + + ovs_nsh_key_attr_size() + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -371,6 +402,14 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] .next = ovs_vxlan_ext_key_lens }, [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, +}; + +static const struct ovs_len_tbl +ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = { + [OVS_NSH_KEY_ATTR_BASE] = { .len = sizeof(struct ovs_nsh_key_base) }, + [OVS_NSH_KEY_ATTR_MD1] = { .len = sizeof(struct ovs_nsh_key_md1) }, + [OVS_NSH_KEY_ATTR_MD2] = { .len = OVS_ATTR_VARIABLE }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -405,6 +444,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { .len = sizeof(struct ovs_key_ct_tuple_ipv4) }, [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = { .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, + [OVS_KEY_ATTR_NSH] = { .len = OVS_ATTR_NESTED, + .next = ovs_nsh_key_attr_lens, }, }; static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) @@ -593,6 +634,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, return 0; } +static int erspan_tun_opt_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + struct erspan_metadata opts; + + BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); + + memset(&opts, 0, sizeof(opts)); + opts.index = nla_get_be32(attr); + + /* Index has only 20-bit */ + if (ntohl(opts.index) & ~INDEX_MASK) { + OVS_NLERR(log, "ERSPAN index number %x too large.", + ntohl(opts.index)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask); + opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), + is_mask); + + return 0; +} + static int ip_tun_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) @@ -700,6 +768,19 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_PAD: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = erspan_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + tun_flags |= TUNNEL_ERSPAN_OPT; + opts_type = type; + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -824,6 +905,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, else if (output->tun_flags & TUNNEL_VXLAN_OPT && vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; + else if (output->tun_flags & TUNNEL_ERSPAN_OPT && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + ((struct erspan_metadata *)tun_opts)->index)) + return -EMSGSIZE; } return 0; @@ -1179,6 +1264,221 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, return 0; } +int nsh_hdr_from_nlattr(const struct nlattr *attr, + struct nshhdr *nh, size_t size) +{ + struct nlattr *a; + int rem; + u8 flags = 0; + u8 ttl = 0; + int mdlen = 0; + + /* validate_nsh has check this, so we needn't do duplicate check here + */ + if (size < NSH_BASE_HDR_LEN) + return -ENOBUFS; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + + flags = base->flags; + ttl = base->ttl; + nh->np = base->np; + nh->mdtype = base->mdtype; + nh->path_hdr = base->path_hdr; + break; + } + case OVS_NSH_KEY_ATTR_MD1: + mdlen = nla_len(a); + if (mdlen > size - NSH_BASE_HDR_LEN) + return -ENOBUFS; + memcpy(&nh->md1, nla_data(a), mdlen); + break; + + case OVS_NSH_KEY_ATTR_MD2: + mdlen = nla_len(a); + if (mdlen > size - NSH_BASE_HDR_LEN) + return -ENOBUFS; + memcpy(&nh->md2, nla_data(a), mdlen); + break; + + default: + return -EINVAL; + } + } + + /* nsh header length = NSH_BASE_HDR_LEN + mdlen */ + nh->ver_flags_ttl_len = 0; + nsh_set_flags_ttl_len(nh, flags, ttl, NSH_BASE_HDR_LEN + mdlen); + + return 0; +} + +int nsh_key_from_nlattr(const struct nlattr *attr, + struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask) +{ + struct nlattr *a; + int rem; + + /* validate_nsh has check this, so we needn't do duplicate check here + */ + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + const struct ovs_nsh_key_base *base_mask = base + 1; + + nsh->base = *base; + nsh_mask->base = *base_mask; + break; + } + case OVS_NSH_KEY_ATTR_MD1: { + const struct ovs_nsh_key_md1 *md1 = nla_data(a); + const struct ovs_nsh_key_md1 *md1_mask = md1 + 1; + + memcpy(nsh->context, md1->context, sizeof(*md1)); + memcpy(nsh_mask->context, md1_mask->context, + sizeof(*md1_mask)); + break; + } + case OVS_NSH_KEY_ATTR_MD2: + /* Not supported yet */ + return -ENOTSUPP; + default: + return -EINVAL; + } + } + + return 0; +} + +static int nsh_key_put_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool is_push_nsh, bool log) +{ + struct nlattr *a; + int rem; + bool has_base = false; + bool has_md1 = false; + bool has_md2 = false; + u8 mdtype = 0; + int mdlen = 0; + + if (WARN_ON(is_push_nsh && is_mask)) + return -EINVAL; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int i; + + if (type > OVS_NSH_KEY_ATTR_MAX) { + OVS_NLERR(log, "nsh attr %d is out of range max %d", + type, OVS_NSH_KEY_ATTR_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_nsh_key_attr_lens[type].len)) { + OVS_NLERR( + log, + "nsh attr %d has unexpected len %d expected %d", + type, + nla_len(a), + ovs_nsh_key_attr_lens[type].len + ); + return -EINVAL; + } + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + + has_base = true; + mdtype = base->mdtype; + SW_FLOW_KEY_PUT(match, nsh.base.flags, + base->flags, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.ttl, + base->ttl, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.mdtype, + base->mdtype, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.np, + base->np, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.path_hdr, + base->path_hdr, is_mask); + break; + } + case OVS_NSH_KEY_ATTR_MD1: { + const struct ovs_nsh_key_md1 *md1 = nla_data(a); + + has_md1 = true; + for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) + SW_FLOW_KEY_PUT(match, nsh.context[i], + md1->context[i], is_mask); + break; + } + case OVS_NSH_KEY_ATTR_MD2: + if (!is_push_nsh) /* Not supported MD type 2 yet */ + return -ENOTSUPP; + + has_md2 = true; + mdlen = nla_len(a); + if (mdlen > NSH_CTX_HDRS_MAX_LEN || mdlen <= 0) { + OVS_NLERR( + log, + "Invalid MD length %d for MD type %d", + mdlen, + mdtype + ); + return -EINVAL; + } + break; + default: + OVS_NLERR(log, "Unknown nsh attribute %d", + type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem); + return -EINVAL; + } + + if (has_md1 && has_md2) { + OVS_NLERR( + 1, + "invalid nsh attribute: md1 and md2 are exclusive." + ); + return -EINVAL; + } + + if (!is_mask) { + if ((has_md1 && mdtype != NSH_M_TYPE1) || + (has_md2 && mdtype != NSH_M_TYPE2)) { + OVS_NLERR(1, "nsh attribute has unmatched MD type %d.", + mdtype); + return -EINVAL; + } + + if (is_push_nsh && + (!has_base || (!has_md1 && !has_md2))) { + OVS_NLERR( + 1, + "push_nsh: missing base or metadata attributes" + ); + return -EINVAL; + } + } + + return 0; +} + static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, u64 attrs, const struct nlattr **a, bool is_mask, bool log) @@ -1306,6 +1606,13 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, attrs &= ~(1 << OVS_KEY_ATTR_ARP); } + if (attrs & (1 << OVS_KEY_ATTR_NSH)) { + if (nsh_key_put_from_nlattr(a[OVS_KEY_ATTR_NSH], match, + is_mask, false, log) < 0) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_NSH); + } + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { const struct ovs_key_mpls *mpls_key; @@ -1622,6 +1929,34 @@ static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh, return 0; } +static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_KEY_ATTR_NSH); + if (!start) + return -EMSGSIZE; + + if (nla_put(skb, OVS_NSH_KEY_ATTR_BASE, sizeof(nsh->base), &nsh->base)) + goto nla_put_failure; + + if (is_mask || nsh->base.mdtype == NSH_M_TYPE1) { + if (nla_put(skb, OVS_NSH_KEY_ATTR_MD1, + sizeof(nsh->context), nsh->context)) + goto nla_put_failure; + } + + /* Don't support MD type 2 yet */ + + nla_nest_end(skb, start); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int __ovs_nla_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, bool is_mask, struct sk_buff *skb) @@ -1750,6 +2085,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ipv6_key->ipv6_tclass = output->ip.tos; ipv6_key->ipv6_hlimit = output->ip.ttl; ipv6_key->ipv6_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_NSH)) { + if (nsh_key_to_nlattr(&output->nsh, is_mask, skb)) + goto nla_put_failure; } else if (swkey->eth.type == htons(ETH_P_ARP) || swkey->eth.type == htons(ETH_P_RARP)) { struct ovs_key_arp *arp_key; @@ -2195,6 +2533,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + break; } }; @@ -2242,6 +2582,19 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, return err; } +static bool validate_nsh(const struct nlattr *attr, bool is_mask, + bool is_push_nsh, bool log) +{ + struct sw_flow_match match; + struct sw_flow_key key; + int ret = 0; + + ovs_match_init(&match, &key, true, NULL); + ret = nsh_key_put_from_nlattr(attr, &match, is_mask, + is_push_nsh, log); + return !ret; +} + /* Return false if there are any non-masked bits set. * Mask follows data immediately, before any netlink padding. */ @@ -2384,6 +2737,13 @@ static int validate_set(const struct nlattr *a, break; + case OVS_KEY_ATTR_NSH: + if (eth_type != htons(ETH_P_NSH)) + return -EINVAL; + if (!validate_nsh(nla_data(a), masked, false, log)) + return -EINVAL; + break; + default: return -EINVAL; } @@ -2479,9 +2839,13 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), [OVS_ACTION_ATTR_CT] = (u32)-1, + [OVS_ACTION_ATTR_CT_CLEAR] = 0, [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), [OVS_ACTION_ATTR_POP_ETH] = 0, + [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, + [OVS_ACTION_ATTR_POP_NSH] = 0, + [OVS_ACTION_ATTR_METER] = sizeof(u32), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2620,6 +2984,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_CT_CLEAR: + break; + case OVS_ACTION_ATTR_PUSH_ETH: /* Disallow pushing an Ethernet header if one * is already present */ @@ -2636,6 +3003,38 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, mac_proto = MAC_PROTO_ETHERNET; break; + case OVS_ACTION_ATTR_PUSH_NSH: + if (mac_proto != MAC_PROTO_ETHERNET) { + u8 next_proto; + + next_proto = tun_p_from_eth_p(eth_type); + if (!next_proto) + return -EINVAL; + } + mac_proto = MAC_PROTO_NONE; + if (!validate_nsh(nla_data(a), false, true, true)) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_POP_NSH: { + __be16 inner_proto; + + if (eth_type != htons(ETH_P_NSH)) + return -EINVAL; + inner_proto = tun_p_to_eth_p(key->nsh.base.np); + if (!inner_proto) + return -EINVAL; + if (key->nsh.base.np == TUN_P_ETHERNET) + mac_proto = MAC_PROTO_ETHERNET; + else + mac_proto = MAC_PROTO_NONE; + break; + } + + case OVS_ACTION_ATTR_METER: + /* Non-existent meters are simply ignored. */ + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 929c665ac3aa..6657606b2b47 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -79,4 +79,9 @@ int ovs_nla_put_actions(const struct nlattr *attr, void ovs_nla_free_flow_actions(struct sw_flow_actions *); void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); +int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh, + struct ovs_key_nsh *nsh_mask); +int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, + size_t size); + #endif /* flow_netlink.h */ diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c new file mode 100644 index 000000000000..3fbfc78991ac --- /dev/null +++ b/net/openvswitch/meter.c @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2017 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/if.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/openvswitch.h> +#include <linux/netlink.h> +#include <linux/rculist.h> + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "datapath.h" +#include "meter.h" + +#define METER_HASH_BUCKETS 1024 + +static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { + [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, + [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, + [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, + [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED }, + [OVS_METER_ATTR_USED] = { .type = NLA_U64 }, + [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG }, + [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 }, + [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 }, +}; + +static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { + [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_RATE] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_BURST] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, +}; + +static void ovs_meter_free(struct dp_meter *meter) +{ + if (!meter) + return; + + kfree_rcu(meter, rcu); +} + +static struct hlist_head *meter_hash_bucket(const struct datapath *dp, + u32 meter_id) +{ + return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; +} + +/* Call with ovs_mutex or RCU read lock. */ +static struct dp_meter *lookup_meter(const struct datapath *dp, + u32 meter_id) +{ + struct dp_meter *meter; + struct hlist_head *head; + + head = meter_hash_bucket(dp, meter_id); + hlist_for_each_entry_rcu(meter, head, dp_hash_node) { + if (meter->id == meter_id) + return meter; + } + return NULL; +} + +static void attach_meter(struct datapath *dp, struct dp_meter *meter) +{ + struct hlist_head *head = meter_hash_bucket(dp, meter->id); + + hlist_add_head_rcu(&meter->dp_hash_node, head); +} + +static void detach_meter(struct dp_meter *meter) +{ + ASSERT_OVSL(); + if (meter) + hlist_del_rcu(&meter->dp_hash_node); +} + +static struct sk_buff * +ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, + struct ovs_header **ovs_reply_header) +{ + struct sk_buff *skb; + struct ovs_header *ovs_header = info->userhdr; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + *ovs_reply_header = genlmsg_put(skb, info->snd_portid, + info->snd_seq, + &dp_meter_genl_family, 0, cmd); + if (!*ovs_reply_header) { + nlmsg_free(skb); + return ERR_PTR(-EMSGSIZE); + } + (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; + + return skb; +} + +static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, + struct dp_meter *meter) +{ + struct nlattr *nla; + struct dp_meter_band *band; + u16 i; + + if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) + goto error; + + if (!meter) + return 0; + + if (nla_put(reply, OVS_METER_ATTR_STATS, + sizeof(struct ovs_flow_stats), &meter->stats) || + nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, + OVS_METER_ATTR_PAD)) + goto error; + + nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS); + if (!nla) + goto error; + + band = meter->bands; + + for (i = 0; i < meter->n_bands; ++i, ++band) { + struct nlattr *band_nla; + + band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC); + if (!band_nla || nla_put(reply, OVS_BAND_ATTR_STATS, + sizeof(struct ovs_flow_stats), + &band->stats)) + goto error; + nla_nest_end(reply, band_nla); + } + nla_nest_end(reply, nla); + + return 0; +error: + return -EMSGSIZE; +} + +static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct nlattr *nla, *band_nla; + int err; + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) || + nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) + goto nla_put_failure; + + nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS); + if (!nla) + goto nla_put_failure; + + band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC); + if (!band_nla) + goto nla_put_failure; + /* Currently only DROP band type is supported. */ + if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, OVS_METER_BAND_TYPE_DROP)) + goto nla_put_failure; + nla_nest_end(reply, band_nla); + nla_nest_end(reply, nla); + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +nla_put_failure: + nlmsg_free(reply); + err = -EMSGSIZE; + return err; +} + +static struct dp_meter *dp_meter_create(struct nlattr **a) +{ + struct nlattr *nla; + int rem; + u16 n_bands = 0; + struct dp_meter *meter; + struct dp_meter_band *band; + int err; + + /* Validate attributes, count the bands. */ + if (!a[OVS_METER_ATTR_BANDS]) + return ERR_PTR(-EINVAL); + + nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) + if (++n_bands > DP_MAX_BANDS) + return ERR_PTR(-EINVAL); + + /* Allocate and set up the meter before locking anything. */ + meter = kzalloc(n_bands * sizeof(struct dp_meter_band) + + sizeof(*meter), GFP_KERNEL); + if (!meter) + return ERR_PTR(-ENOMEM); + + meter->used = div_u64(ktime_get_ns(), 1000 * 1000); + meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0; + meter->keep_stats = !a[OVS_METER_ATTR_CLEAR]; + spin_lock_init(&meter->lock); + if (meter->keep_stats && a[OVS_METER_ATTR_STATS]) { + meter->stats = *(struct ovs_flow_stats *) + nla_data(a[OVS_METER_ATTR_STATS]); + } + meter->n_bands = n_bands; + + /* Set up meter bands. */ + band = meter->bands; + nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) { + struct nlattr *attr[OVS_BAND_ATTR_MAX + 1]; + u32 band_max_delta_t; + + err = nla_parse((struct nlattr **)&attr, OVS_BAND_ATTR_MAX, + nla_data(nla), nla_len(nla), band_policy, + NULL); + if (err) + goto exit_free_meter; + + if (!attr[OVS_BAND_ATTR_TYPE] || + !attr[OVS_BAND_ATTR_RATE] || + !attr[OVS_BAND_ATTR_BURST]) { + err = -EINVAL; + goto exit_free_meter; + } + + band->type = nla_get_u32(attr[OVS_BAND_ATTR_TYPE]); + band->rate = nla_get_u32(attr[OVS_BAND_ATTR_RATE]); + band->burst_size = nla_get_u32(attr[OVS_BAND_ATTR_BURST]); + /* Figure out max delta_t that is enough to fill any bucket. + * Keep max_delta_t size to the bucket units: + * pkts => 1/1000 packets, kilobits => bits. + */ + band_max_delta_t = (band->burst_size + band->rate) * 1000; + /* Start with a full bucket. */ + band->bucket = band_max_delta_t; + if (band_max_delta_t > meter->max_delta_t) + meter->max_delta_t = band_max_delta_t; + band++; + } + + return meter; + +exit_free_meter: + kfree(meter); + return ERR_PTR(err); +} + +static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct dp_meter *meter, *old_meter; + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct ovs_header *ovs_header = info->userhdr; + struct datapath *dp; + int err; + u32 meter_id; + bool failed; + + meter = dp_meter_create(a); + if (IS_ERR_OR_NULL(meter)) + return PTR_ERR(meter); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, + &ovs_reply_header); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + goto exit_free_meter; + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + if (!a[OVS_METER_ATTR_ID]) { + err = -ENODEV; + goto exit_unlock; + } + + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + + /* Cannot fail after this. */ + old_meter = lookup_meter(dp, meter_id); + detach_meter(old_meter); + attach_meter(dp, meter); + ovs_unlock(); + + /* Build response with the meter_id and stats from + * the old meter, if any. + */ + failed = nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id); + WARN_ON(failed); + if (old_meter) { + spin_lock_bh(&old_meter->lock); + if (old_meter->keep_stats) { + err = ovs_meter_cmd_reply_stats(reply, meter_id, + old_meter); + WARN_ON(err); + } + spin_unlock_bh(&old_meter->lock); + ovs_meter_free(old_meter); + } + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); +exit_free_meter: + kfree(meter); + return err; +} + +static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + u32 meter_id; + struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_reply_header; + struct datapath *dp; + int err; + struct sk_buff *reply; + struct dp_meter *meter; + + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; + + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_GET, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + ovs_lock(); + + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + /* Locate meter, copy stats. */ + meter = lookup_meter(dp, meter_id); + if (!meter) { + err = -ENOENT; + goto exit_unlock; + } + + spin_lock_bh(&meter->lock); + err = ovs_meter_cmd_reply_stats(reply, meter_id, meter); + spin_unlock_bh(&meter->lock); + if (err) + goto exit_unlock; + + ovs_unlock(); + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); + return err; +} + +static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + u32 meter_id; + struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_reply_header; + struct datapath *dp; + int err; + struct sk_buff *reply; + struct dp_meter *old_meter; + + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + ovs_lock(); + + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + old_meter = lookup_meter(dp, meter_id); + if (old_meter) { + spin_lock_bh(&old_meter->lock); + err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); + WARN_ON(err); + spin_unlock_bh(&old_meter->lock); + detach_meter(old_meter); + } + ovs_unlock(); + ovs_meter_free(old_meter); + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); + return err; +} + +/* Meter action execution. + * + * Return true 'meter_id' drop band is triggered. The 'skb' should be + * dropped by the caller'. + */ +bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, u32 meter_id) +{ + struct dp_meter *meter; + struct dp_meter_band *band; + long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); + long long int long_delta_ms; + u32 delta_ms; + u32 cost; + int i, band_exceeded_max = -1; + u32 band_exceeded_rate = 0; + + meter = lookup_meter(dp, meter_id); + /* Do not drop the packet when there is no meter. */ + if (!meter) + return false; + + /* Lock the meter while using it. */ + spin_lock(&meter->lock); + + long_delta_ms = (now_ms - meter->used); /* ms */ + + /* Make sure delta_ms will not be too large, so that bucket will not + * wrap around below. + */ + delta_ms = (long_delta_ms > (long long int)meter->max_delta_t) + ? meter->max_delta_t : (u32)long_delta_ms; + + /* Update meter statistics. + */ + meter->used = now_ms; + meter->stats.n_packets += 1; + meter->stats.n_bytes += skb->len; + + /* Bucket rate is either in kilobits per second, or in packets per + * second. We maintain the bucket in the units of either bits or + * 1/1000th of a packet, correspondingly. + * Then, when rate is multiplied with milliseconds, we get the + * bucket units: + * msec * kbps = bits, and + * msec * packets/sec = 1/1000 packets. + * + * 'cost' is the number of bucket units in this packet. + */ + cost = (meter->kbps) ? skb->len * 8 : 1000; + + /* Update all bands and find the one hit with the highest rate. */ + for (i = 0; i < meter->n_bands; ++i) { + long long int max_bucket_size; + + band = &meter->bands[i]; + max_bucket_size = (band->burst_size + band->rate) * 1000; + + band->bucket += delta_ms * band->rate; + if (band->bucket > max_bucket_size) + band->bucket = max_bucket_size; + + if (band->bucket >= cost) { + band->bucket -= cost; + } else if (band->rate > band_exceeded_rate) { + band_exceeded_rate = band->rate; + band_exceeded_max = i; + } + } + + if (band_exceeded_max >= 0) { + /* Update band statistics. */ + band = &meter->bands[band_exceeded_max]; + band->stats.n_packets += 1; + band->stats.n_bytes += skb->len; + + /* Drop band triggered, let the caller drop the 'skb'. */ + if (band->type == OVS_METER_BAND_TYPE_DROP) { + spin_unlock(&meter->lock); + return true; + } + } + + spin_unlock(&meter->lock); + return false; +} + +static struct genl_ops dp_meter_genl_ops[] = { + { .cmd = OVS_METER_CMD_FEATURES, + .flags = 0, /* OK for unprivileged users. */ + .policy = meter_policy, + .doit = ovs_meter_cmd_features + }, + { .cmd = OVS_METER_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ + .policy = meter_policy, + .doit = ovs_meter_cmd_set, + }, + { .cmd = OVS_METER_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = meter_policy, + .doit = ovs_meter_cmd_get, + }, + { .cmd = OVS_METER_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ + .policy = meter_policy, + .doit = ovs_meter_cmd_del + }, +}; + +static const struct genl_multicast_group ovs_meter_multicast_group = { + .name = OVS_METER_MCGROUP, +}; + +struct genl_family dp_meter_genl_family __ro_after_init = { + .hdrsize = sizeof(struct ovs_header), + .name = OVS_METER_FAMILY, + .version = OVS_METER_VERSION, + .maxattr = OVS_METER_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_meter_genl_ops, + .n_ops = ARRAY_SIZE(dp_meter_genl_ops), + .mcgrps = &ovs_meter_multicast_group, + .n_mcgrps = 1, + .module = THIS_MODULE, +}; + +int ovs_meters_init(struct datapath *dp) +{ + int i; + + dp->meters = kmalloc_array(METER_HASH_BUCKETS, + sizeof(struct hlist_head), GFP_KERNEL); + + if (!dp->meters) + return -ENOMEM; + + for (i = 0; i < METER_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->meters[i]); + + return 0; +} + +void ovs_meters_exit(struct datapath *dp) +{ + int i; + + for (i = 0; i < METER_HASH_BUCKETS; i++) { + struct hlist_head *head = &dp->meters[i]; + struct dp_meter *meter; + struct hlist_node *n; + + hlist_for_each_entry_safe(meter, n, head, dp_hash_node) + kfree(meter); + } + + kfree(dp->meters); +} diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h new file mode 100644 index 000000000000..964ace2650f8 --- /dev/null +++ b/net/openvswitch/meter.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ + +#ifndef METER_H +#define METER_H 1 + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/genetlink.h> +#include <linux/skbuff.h> + +#include "flow.h" +struct datapath; + +#define DP_MAX_BANDS 1 + +struct dp_meter_band { + u32 type; + u32 rate; + u32 burst_size; + u32 bucket; /* 1/1000 packets, or in bits */ + struct ovs_flow_stats stats; +}; + +struct dp_meter { + spinlock_t lock; /* Per meter lock */ + struct rcu_head rcu; + struct hlist_node dp_hash_node; /*Element in datapath->meters + * hash table. + */ + u32 id; + u16 kbps:1, keep_stats:1; + u16 n_bands; + u32 max_delta_t; + u64 used; + struct ovs_flow_stats stats; + struct dp_meter_band bands[]; +}; + +extern struct genl_family dp_meter_genl_family; +int ovs_meters_init(struct datapath *dp); +void ovs_meters_exit(struct datapath *dp); +bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, u32 meter_id); + +#endif /* meter.h */ diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 0389398fa4ab..2e5e7a41d8ef 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -108,7 +108,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp), NULL, NULL); + get_dpdev(vport->dp), + NULL, NULL, NULL); if (err) goto error_unlock; |