From ff491a7334acfd74e515c896632e37e401f52676 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 5 Feb 2009 23:56:36 -0800 Subject: netlink: change return-value logic of netlink_broadcast() Currently, netlink_broadcast() reports errors to the caller if no messages at all were delivered: 1) If, at least, one message has been delivered correctly, returns 0. 2) Otherwise, if no messages at all were delivered due to skb_clone() failure, return -ENOBUFS. 3) Otherwise, if there are no listeners, return -ESRCH. With this patch, the caller knows if the delivery of any of the messages to the listeners have failed: 1) If it fails to deliver any message (for whatever reason), return -ENOBUFS. 2) Otherwise, if all messages were delivered OK, returns 0. 3) Otherwise, if no listeners, return -ESRCH. In the current ctnetlink code and in Netfilter in general, we can add reliable logging and connection tracking event delivery by dropping the packets whose events were not successfully delivered over Netlink. Of course, this option would be settable via /proc as this approach reduces performance (in terms of filtered connections per seconds by a stateful firewall) but providing reliable logging and event delivery (for conntrackd) in return. This patch also changes some clients of netlink_broadcast() that may report ENOBUFS errors via printk. This error handling is not of any help. Instead, the userspace daemons that are listening to those netlink messages should resync themselves with the kernel-side if they hit ENOBUFS. BTW, netlink_broadcast() clients include those that call cn_netlink_send(), nlmsg_multicast() and genlmsg_multicast() since they internally call netlink_broadcast() and return its error value. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9eb895c7a2a9..6ee69c27f806 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -950,6 +950,7 @@ struct netlink_broadcast_data { u32 pid; u32 group; int failure; + int delivery_failure; int congested; int delivered; gfp_t allocation; @@ -999,6 +1000,7 @@ static inline int do_one_broadcast(struct sock *sk, p->skb2 = NULL; } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { netlink_overrun(sk); + p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; @@ -1025,6 +1027,7 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, info.pid = pid; info.group = group; info.failure = 0; + info.delivery_failure = 0; info.congested = 0; info.delivered = 0; info.allocation = allocation; @@ -1045,13 +1048,14 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, if (info.skb2) kfree_skb(info.skb2); + if (info.delivery_failure || info.failure) + return -ENOBUFS; + if (info.delivered) { if (info.congested && (allocation & __GFP_WAIT)) yield(); return 0; } - if (info.failure) - return -ENOBUFS; return -ESRCH; } EXPORT_SYMBOL(netlink_broadcast); -- cgit v1.2.3 From be0c22a46cfb79ab2342bb28fde99afa94ef868e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Feb 2009 01:40:43 +0000 Subject: netlink: add NETLINK_BROADCAST_ERROR socket option This patch adds NETLINK_BROADCAST_ERROR which is a netlink socket option that the listener can set to make netlink_broadcast() return errors in the delivery to the caller. This option is useful if the caller of netlink_broadcast() do something with the result of the message delivery, like in ctnetlink where it drops a network packet if the event delivery failed, this is used to enable reliable logging and state-synchronization. If this socket option is not set, netlink_broadcast() only reports ESRCH errors and silently ignore ENOBUFS errors, which is what most netlink_broadcast() callers should do. This socket option is based on a suggestion from Patrick McHardy. Patrick McHardy can exchange this patch for a beer from me ;). Signed-off-by: Pablo Neira Ayuso Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netlink.h | 1 + net/netlink/af_netlink.c | 25 +++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 51b09a1f46c3..1e6bf995435c 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -103,6 +103,7 @@ struct nlmsgerr #define NETLINK_ADD_MEMBERSHIP 1 #define NETLINK_DROP_MEMBERSHIP 2 #define NETLINK_PKTINFO 3 +#define NETLINK_BROADCAST_ERROR 4 struct nl_pktinfo { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6ee69c27f806..ed587be1e1c2 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -85,6 +85,7 @@ struct netlink_sock { #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 +#define NETLINK_BROADCAST_SEND_ERROR 0x4 static inline struct netlink_sock *nlk_sk(struct sock *sk) { @@ -995,12 +996,15 @@ static inline int do_one_broadcast(struct sock *sk, netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; + if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + p->delivery_failure = 1; } else if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { netlink_overrun(sk); - p->delivery_failure = 1; + if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; @@ -1048,7 +1052,7 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, if (info.skb2) kfree_skb(info.skb2); - if (info.delivery_failure || info.failure) + if (info.delivery_failure) return -ENOBUFS; if (info.delivered) { @@ -1163,6 +1167,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, err = 0; break; } + case NETLINK_BROADCAST_ERROR: + if (val) + nlk->flags |= NETLINK_BROADCAST_SEND_ERROR; + else + nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1195,6 +1206,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_BROADCAST_ERROR: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } -- cgit v1.2.3 From 1ce85fe402137824246bad03ff85f3913d565c17 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 24 Feb 2009 23:18:28 -0800 Subject: netlink: change nlmsg_notify() return value logic This patch changes the return value of nlmsg_notify() as follows: If NETLINK_BROADCAST_ERROR is set by any of the listeners and an error in the delivery happened, return the broadcast error; else if there are no listeners apart from the socket that requested a change with the echo flag, return the result of the unicast notification. Thus, with this patch, the unicast notification is handled in the same way of a broadcast listener that has set the NETLINK_BROADCAST_ERROR socket flag. This patch is useful in case that the caller of nlmsg_notify() wants to know the result of the delivery of a netlink notification (including the broadcast delivery) and take any action in case that the delivery failed. For example, ctnetlink can drop packets if the event delivery failed to provide reliable logging and state-synchronization at the cost of dropping packets. This patch also modifies the rtnetlink code to ignore the return value of rtnl_notify() in all callers. The function rtnl_notify() (before this patch) returned the error of the unicast notification which makes rtnl_set_sk_err() reports errors to all listeners. This is not of any help since the origin of the change (the socket that requested the echoing) notices the ENOBUFS error if the notification fails and should resync itself. Signed-off-by: Pablo Neira Ayuso Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 4 ++-- net/bridge/br_netlink.c | 3 ++- net/core/fib_rules.c | 3 ++- net/core/neighbour.c | 3 ++- net/core/rtnetlink.c | 9 +++++---- net/decnet/dn_dev.c | 3 ++- net/decnet/dn_table.c | 3 ++- net/ipv4/devinet.c | 3 ++- net/ipv4/fib_semantics.c | 5 +++-- net/ipv6/addrconf.c | 9 ++++++--- net/ipv6/ndisc.c | 6 +----- net/ipv6/route.c | 5 +++-- net/netlink/af_netlink.c | 14 ++++++++++---- net/phonet/pn_netlink.c | 5 +++-- 14 files changed, 45 insertions(+), 30 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 1e5f6730ff31..35a07c830f79 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -622,8 +622,8 @@ static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str) extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); -extern int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, - struct nlmsghdr *nlh, gfp_t flags); +extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, + u32 group, struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index ba7be195803c..fcffb3fb1177 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -98,7 +98,8 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 32b3a0152d7a..98691e1466b8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -588,7 +588,8 @@ static void notify_rule_change(int event, struct fib_rule *rule, goto errout; } - err = rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); + rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, ops->nlgroup, err); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 278a142d1047..e1144cb94b99 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2534,7 +2534,8 @@ static void __neigh_notify(struct neighbour *n, int type, int flags) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 790dd205bb5d..d78030f88bd0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -455,8 +455,8 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) return nlmsg_unicast(rtnl, skb, pid); } -int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, - struct nlmsghdr *nlh, gfp_t flags) +void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, + struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; int report = 0; @@ -464,7 +464,7 @@ int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, if (nlh) report = nlmsg_report(nlh); - return nlmsg_notify(rtnl, skb, pid, group, report, flags); + nlmsg_notify(rtnl, skb, pid, group, report, flags); } void rtnl_set_sk_err(struct net *net, u32 group, int error) @@ -1246,7 +1246,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index daf2b98b15fe..e457769bf7a7 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -769,7 +769,8 @@ static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); + rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err); diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 69ad9280c693..67054b0d550f 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -375,7 +375,8 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); + rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d519a6a66726..126bb911880f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1216,7 +1216,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4817dea3bc73..f831df500907 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -322,8 +322,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, - info->nlh, GFP_KERNEL); + rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, + info->nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 03e2a1ad71e9..f8f76d6e21cb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3638,7 +3638,8 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); @@ -3849,7 +3850,8 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); @@ -3919,7 +3921,8 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3cd83b85e9ef..9f061d1adbc2 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1095,11 +1095,7 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) &ipv6_hdr(ra)->saddr); nlmsg_end(skb, nlh); - err = rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, - GFP_ATOMIC); - if (err < 0) - goto errout; - + rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); return; nla_put_failure: diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c3d486a3edad..1394ddb6e35c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2400,8 +2400,9 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, - info->nlh, gfp_any()); + rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ed587be1e1c2..2760b62dc2c1 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1760,12 +1760,18 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, exclude_pid = pid; } - /* errors reported via destination sk->sk_err */ - nlmsg_multicast(sk, skb, exclude_pid, group, flags); + /* errors reported via destination sk->sk_err, but propagate + * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ + err = nlmsg_multicast(sk, skb, exclude_pid, group, flags); } - if (report) - err = nlmsg_unicast(sk, skb, pid); + if (report) { + int err2; + + err2 = nlmsg_unicast(sk, skb, pid); + if (!err || err == -ESRCH) + err = err2; + } return err; } diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 1ceea1f92413..cec4e5951681 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -47,8 +47,9 @@ static void rtmsg_notify(int event, struct net_device *dev, u8 addr) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, dev_net(dev), 0, - RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); + rtnl_notify(skb, dev_net(dev), 0, + RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err); -- cgit v1.2.3 From 91744f6559393697e13bf0f9f3f35f884e2520f9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 25 Feb 2009 00:34:41 +0000 Subject: netlink: remove some pointless conditionals before kfree_skb() Remove some pointless conditionals before kfree_skb(). Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2760b62dc2c1..e57d700bf6d9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1049,8 +1049,7 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, netlink_unlock_table(); - if (info.skb2) - kfree_skb(info.skb2); + kfree_skb(info.skb2); if (info.delivery_failure) return -ENOBUFS; @@ -1542,8 +1541,7 @@ EXPORT_SYMBOL(netlink_set_nonroot); static void netlink_destroy_callback(struct netlink_callback *cb) { - if (cb->skb) - kfree_skb(cb->skb); + kfree_skb(cb->skb); kfree(cb); } -- cgit v1.2.3 From dd5b6ce6fd465eab90357711c8e8124dc3a31ff0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 23 Mar 2009 13:21:06 +0100 Subject: nefilter: nfnetlink: add nfnetlink_set_err and use it in ctnetlink This patch adds nfnetlink_set_err() to propagate the error to netlink broadcast listener in case of memory allocation errors in the message building. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/nfnetlink.h | 1 + net/netfilter/nf_conntrack_netlink.c | 2 ++ net/netfilter/nfnetlink.c | 6 ++++++ net/netlink/af_netlink.c | 1 + 4 files changed, 10 insertions(+) (limited to 'net/netlink/af_netlink.c') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 7d8e0455ccac..135e5cfe68a2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -76,6 +76,7 @@ extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); extern int nfnetlink_has_listeners(unsigned int group); extern int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo); +extern void nfnetlink_set_err(u32 pid, u32 group, int error); extern int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags); extern void nfnl_lock(void); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d1fe9d15ac5c..1b75c9efb0eb 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -518,6 +518,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, nla_put_failure: rcu_read_unlock(); nlmsg_failure: + nfnetlink_set_err(0, group, -ENOBUFS); kfree_skb(skb); return NOTIFY_DONE; } @@ -1514,6 +1515,7 @@ static int ctnetlink_expect_event(struct notifier_block *this, nla_put_failure: rcu_read_unlock(); nlmsg_failure: + nfnetlink_set_err(0, 0, -ENOBUFS); kfree_skb(skb); return NOTIFY_DONE; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 9c0ba17a1ddb..2785d66a7e38 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -113,6 +113,12 @@ int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) } EXPORT_SYMBOL_GPL(nfnetlink_send); +void nfnetlink_set_err(u32 pid, u32 group, int error) +{ + netlink_set_err(nfnl, pid, group, error); +} +EXPORT_SYMBOL_GPL(nfnetlink_set_err); + int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags) { return netlink_unicast(nfnl, skb, pid, flags); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6ee69c27f806..5b33879c6422 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1106,6 +1106,7 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) read_unlock(&nl_table_lock); } +EXPORT_SYMBOL(netlink_set_err); /* must be called with netlink table grabbed */ static void netlink_update_socket_mc(struct netlink_sock *nlk, -- cgit v1.2.3 From 38938bfe3489394e2eed5e40c9bb8f66a2ce1405 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 24 Mar 2009 16:37:55 -0700 Subject: netlink: add NETLINK_NO_ENOBUFS socket flag This patch adds the NETLINK_NO_ENOBUFS socket flag. This flag can be used by unicast and broadcast listeners to avoid receiving ENOBUFS errors. Generally speaking, ENOBUFS errors are useful to notify two things to the listener: a) You may increase the receiver buffer size via setsockopt(). b) You have lost messages, you may be out of sync. In some cases, ignoring ENOBUFS errors can be useful. For example: a) nfnetlink_queue: this subsystem does not have any sort of resync method and you can decide to ignore ENOBUFS once you have set a given buffer size. b) ctnetlink: you can use this together with the socket flag NETLINK_BROADCAST_SEND_ERROR to stop getting ENOBUFS errors as you do not need to resync (packets whose event are not delivered are drop to provide reliable logging and state-synchronization). Moreover, the use of NETLINK_NO_ENOBUFS also reduces a "go up, go down" effect in terms of performance which is due to the netlink congestion control when the listener cannot back off. The effect is the following: 1) throughput rate goes up and netlink messages are inserted in the receiver buffer. 2) Then, netlink buffer fills and overruns (set on nlk->state bit 0). 3) While the listener empties the receiver buffer, netlink keeps dropping messages. Thus, throughput goes dramatically down. 4) Then, once the listener has emptied the buffer (nlk->state bit 0 is set off), goto step 1. This effect is easy to trigger with netlink broadcast under heavy load, and it is more noticeable when using a big receiver buffer. You can find some results in [1] that show this problem. [1] http://1984.lsi.us.es/linux/netlink/ This patch also includes the use of sk_drop to account the number of netlink messages drop due to overrun. This value is shown in /proc/net/netlink. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netlink.h | 1 + net/netlink/af_netlink.c | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 6 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 1e6bf995435c..5ba398e90304 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -104,6 +104,7 @@ struct nlmsgerr #define NETLINK_DROP_MEMBERSHIP 2 #define NETLINK_PKTINFO 3 #define NETLINK_BROADCAST_ERROR 4 +#define NETLINK_NO_ENOBUFS 5 struct nl_pktinfo { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b73d4e61c5ac..8b6bbb3032b0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -86,6 +86,7 @@ struct netlink_sock { #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 #define NETLINK_BROADCAST_SEND_ERROR 0x4 +#define NETLINK_RECV_NO_ENOBUFS 0x8 static inline struct netlink_sock *nlk_sk(struct sock *sk) { @@ -717,10 +718,15 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, static void netlink_overrun(struct sock *sk) { - if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); + struct netlink_sock *nlk = nlk_sk(sk); + + if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + } } + atomic_inc(&sk->sk_drops); } static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) @@ -1182,6 +1188,15 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; err = 0; break; + case NETLINK_NO_ENOBUFS: + if (val) { + nlk->flags |= NETLINK_RECV_NO_ENOBUFS; + clear_bit(0, &nlk->state); + wake_up_interruptible(&nlk->wait); + } else + nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1224,6 +1239,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_NO_ENOBUFS: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1879,12 +1904,12 @@ static int netlink_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, "sk Eth Pid Groups " - "Rmem Wmem Dump Locks\n"); + "Rmem Wmem Dump Locks Drops\n"); else { struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); - seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %d\n", + seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %-8d %-8d\n", s, s->sk_protocol, nlk->pid, @@ -1892,7 +1917,8 @@ static int netlink_seq_show(struct seq_file *seq, void *v) atomic_read(&s->sk_rmem_alloc), atomic_read(&s->sk_wmem_alloc), nlk->cb, - atomic_read(&s->sk_refcnt) + atomic_read(&s->sk_refcnt), + atomic_read(&s->sk_drops) ); } -- cgit v1.2.3