diff options
author | David S. Miller <davem@davemloft.net> | 2012-05-08 22:40:21 +0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-05-08 22:40:21 +0400 |
commit | 9bb862beb6e5839e92f709d33fda07678f062f20 (patch) | |
tree | a2c396712c5a2cda380034173fd07a67bfa0489f /net/netfilter | |
parent | b44907e64cc1987153f6577306108379be1523b7 (diff) | |
parent | d16cf20e2f2f13411eece7f7fb72c17d141c4a84 (diff) | |
download | linux-9bb862beb6e5839e92f709d33fda07678f062f20.tar.xz |
Merge branch 'master' of git://1984.lsi.us.es/net-next
Diffstat (limited to 'net/netfilter')
-rw-r--r-- | net/netfilter/ipvs/ip_vs_conn.c | 70 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_core.c | 30 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ctl.c | 70 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_dh.c | 2 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ftp.c | 2 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_lblc.c | 2 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_lblcr.c | 2 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto.c | 6 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_sh.c | 2 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_sync.c | 662 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_wrr.c | 2 | ||||
-rw-r--r-- | net/netfilter/nf_conntrack_core.c | 15 | ||||
-rw-r--r-- | net/netfilter/nf_conntrack_ecache.c | 10 | ||||
-rw-r--r-- | net/netfilter/nf_conntrack_helper.c | 122 | ||||
-rw-r--r-- | net/netfilter/nf_conntrack_netlink.c | 10 |
15 files changed, 714 insertions, 293 deletions
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 4a09b7873003..1548df9a7524 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -548,6 +548,7 @@ static inline void ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) { unsigned int conn_flags; + __u32 flags; /* if dest is NULL, then return directly */ if (!dest) @@ -559,17 +560,19 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) conn_flags = atomic_read(&dest->conn_flags); if (cp->protocol != IPPROTO_UDP) conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; + flags = cp->flags; /* Bind with the destination and its corresponding transmitter */ - if (cp->flags & IP_VS_CONN_F_SYNC) { + if (flags & IP_VS_CONN_F_SYNC) { /* if the connection is not template and is created * by sync, preserve the activity flag. */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) + if (!(flags & IP_VS_CONN_F_TEMPLATE)) conn_flags &= ~IP_VS_CONN_F_INACTIVE; /* connections inherit forwarding method from dest */ - cp->flags &= ~IP_VS_CONN_F_FWD_MASK; + flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); } - cp->flags |= conn_flags; + flags |= conn_flags; + cp->flags = flags; cp->dest = dest; IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " @@ -584,12 +587,12 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) atomic_read(&dest->refcnt)); /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so increase the inactive - connection counter because it is in TCP SYNRECV - state (inactive) or other protocol inacive state */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so modify the counters + * according to the flags, later the protocol can + * update them on state change + */ + if (!(flags & IP_VS_CONN_F_INACTIVE)) atomic_inc(&dest->activeconns); else atomic_inc(&dest->inactconns); @@ -613,14 +616,40 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) { struct ip_vs_dest *dest; - if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, - cp->dport, &cp->vaddr, cp->vport, - cp->protocol, cp->fwmark, cp->flags); + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + cp->dport, &cp->vaddr, cp->vport, + cp->protocol, cp->fwmark, cp->flags); + if (dest) { + struct ip_vs_proto_data *pd; + + spin_lock(&cp->lock); + if (cp->dest) { + spin_unlock(&cp->lock); + return dest; + } + + /* Applications work depending on the forwarding method + * but better to reassign them always when binding dest */ + if (cp->app) + ip_vs_unbind_app(cp); + ip_vs_bind_dest(cp, dest); - return dest; - } else - return NULL; + spin_unlock(&cp->lock); + + /* Update its packet transmitter */ + cp->packet_xmit = NULL; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); + if (pd && atomic_read(&pd->appcnt)) + ip_vs_bind_app(cp, pd->pp); + } + return dest; } @@ -743,7 +772,8 @@ int ip_vs_check_template(struct ip_vs_conn *ct) static void ip_vs_conn_expire(unsigned long data) { struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct net *net = ip_vs_conn_net(cp); + struct netns_ipvs *ipvs = net_ipvs(net); cp->timeout = 60*HZ; @@ -808,6 +838,9 @@ static void ip_vs_conn_expire(unsigned long data) atomic_read(&cp->refcnt)-1, atomic_read(&cp->n_control)); + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); + ip_vs_conn_put(cp); } @@ -881,6 +914,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, /* Set its state and timeout */ cp->state = 0; cp->timeout = 3*HZ; + cp->sync_endtime = jiffies & ~3UL; /* Bind its packet transmitter */ #ifdef CONFIG_IP_VS_IPV6 diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c8f36b96f44f..a54b018c6eea 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1613,34 +1613,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) else pkts = atomic_add_return(1, &cp->in_pkts); - if ((ipvs->sync_state & IP_VS_STATE_MASTER) && - cp->protocol == IPPROTO_SCTP) { - if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && - (pkts % sysctl_sync_period(ipvs) - == sysctl_sync_threshold(ipvs))) || - (cp->old_state != cp->state && - ((cp->state == IP_VS_SCTP_S_CLOSED) || - (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || - (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { - ip_vs_sync_conn(net, cp); - goto out; - } - } - - /* Keep this block last: TCP and others with pp->num_states <= 1 */ - else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && - (((cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (pkts % sysctl_sync_period(ipvs) - == sysctl_sync_threshold(ipvs))) || - ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && - ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE) || - (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || - (cp->state == IP_VS_TCP_S_TIME_WAIT))))) - ip_vs_sync_conn(net, cp); -out: - cp->old_state = cp->state; + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, pkts); ip_vs_conn_put(cp); return ret; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 37b91996bfba..dd811b8dd97c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1599,6 +1599,10 @@ static int ip_vs_zero_all(struct net *net) } #ifdef CONFIG_SYSCTL + +static int zero; +static int three = 3; + static int proc_do_defense_mode(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1632,7 +1636,8 @@ proc_do_sync_threshold(ctl_table *table, int write, memcpy(val, valp, sizeof(val)); rc = proc_dointvec(table, write, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + if (write && (valp[0] < 0 || valp[1] < 0 || + (valp[0] >= valp[1] && valp[1]))) { /* Restore the correct value */ memcpy(valp, val, sizeof(val)); } @@ -1652,9 +1657,24 @@ proc_do_sync_mode(ctl_table *table, int write, if ((*valp < 0) || (*valp > 1)) { /* Restore the correct value */ *valp = val; - } else { - struct net *net = current->nsproxy->net_ns; - ip_vs_sync_switch_mode(net, val); + } + } + return rc; +} + +static int +proc_do_sync_ports(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if (*valp < 1 || !is_power_of_2(*valp)) { + /* Restore the correct value */ + *valp = val; } } return rc; @@ -1718,6 +1738,24 @@ static struct ctl_table vs_vars[] = { .proc_handler = &proc_do_sync_mode, }, { + .procname = "sync_ports", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_sync_ports, + }, + { + .procname = "sync_qlen_max", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_sock_size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "cache_bypass", .maxlen = sizeof(int), .mode = 0644, @@ -1743,6 +1781,20 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_do_sync_threshold, }, { + .procname = "sync_refresh_period", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "sync_retries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &three, + }, + { .procname = "nat_icmp_send", .maxlen = sizeof(int), .mode = 0644, @@ -3655,6 +3707,12 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net) tbl[idx++].data = &ipvs->sysctl_snat_reroute; ipvs->sysctl_sync_ver = 1; tbl[idx++].data = &ipvs->sysctl_sync_ver; + ipvs->sysctl_sync_ports = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ports; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + tbl[idx++].data = &ipvs->sysctl_sync_sock_size; tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; @@ -3662,6 +3720,10 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net) ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); + ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; + tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; + ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); + tbl[idx++].data = &ipvs->sysctl_sync_retries; tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 1a53a7a2fff0..8b7dca9ea422 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -149,7 +149,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc) /* allocate the DH table for this service */ tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, - GFP_ATOMIC); + GFP_KERNEL); if (tbl == NULL) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 510f2b5a5855..b20b29c903ef 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -485,7 +485,7 @@ static struct pernet_operations ip_vs_ftp_ops = { .exit = __ip_vs_ftp_exit, }; -int __init ip_vs_ftp_init(void) +static int __init ip_vs_ftp_init(void) { int rv; diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 9b0de9a0e08e..df646ccf08a7 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -342,7 +342,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblc_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 9dcd39a48897..570e31ea427a 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -511,7 +511,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblcr_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index fdc82ad9cc0e..50d82186da87 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -68,7 +68,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) struct netns_ipvs *ipvs = net_ipvs(net); unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = - kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC); + kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); if (!pd) return -ENOMEM; @@ -156,7 +156,7 @@ EXPORT_SYMBOL(ip_vs_proto_get); /* * get ip_vs_protocol object data by netns and proto */ -struct ip_vs_proto_data * +static struct ip_vs_proto_data * __ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { struct ip_vs_proto_data *pd; @@ -199,7 +199,7 @@ void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) int * ip_vs_create_timeout_table(int *table, int size) { - return kmemdup(table, size, GFP_ATOMIC); + return kmemdup(table, size, GFP_KERNEL); } diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 91e97ee049be..05126521743e 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -162,7 +162,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc) /* allocate the SH table for this service */ tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_ATOMIC); + GFP_KERNEL); if (tbl == NULL) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index bf5e538af67b..effa10c9e4e3 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -196,6 +196,7 @@ struct ip_vs_sync_thread_data { struct net *net; struct socket *sock; char *buf; + int id; }; /* Version 0 definition of packet sizes */ @@ -271,13 +272,6 @@ struct ip_vs_sync_buff { unsigned char *end; }; -/* multicast addr */ -static struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), -}; - /* * Copy of struct ip_vs_seq * From unaligned network order to aligned host order @@ -300,18 +294,22 @@ static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) put_unaligned_be32(ho->previous_delta, &no->previous_delta); } -static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs) +static inline struct ip_vs_sync_buff * +sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) { struct ip_vs_sync_buff *sb; spin_lock_bh(&ipvs->sync_lock); - if (list_empty(&ipvs->sync_queue)) { + if (list_empty(&ms->sync_queue)) { sb = NULL; + __set_current_state(TASK_INTERRUPTIBLE); } else { - sb = list_entry(ipvs->sync_queue.next, - struct ip_vs_sync_buff, + sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, list); list_del(&sb->list); + ms->sync_queue_len--; + if (!ms->sync_queue_len) + ms->sync_queue_delay = 0; } spin_unlock_bh(&ipvs->sync_lock); @@ -334,7 +332,7 @@ ip_vs_sync_buff_create(struct netns_ipvs *ipvs) kfree(sb); return NULL; } - sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */ + sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ sb->mesg->version = SYNC_PROTO_VER; sb->mesg->syncid = ipvs->master_syncid; sb->mesg->size = sizeof(struct ip_vs_sync_mesg); @@ -353,14 +351,22 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) kfree(sb); } -static inline void sb_queue_tail(struct netns_ipvs *ipvs) +static inline void sb_queue_tail(struct netns_ipvs *ipvs, + struct ipvs_master_sync_state *ms) { - struct ip_vs_sync_buff *sb = ipvs->sync_buff; + struct ip_vs_sync_buff *sb = ms->sync_buff; spin_lock(&ipvs->sync_lock); - if (ipvs->sync_state & IP_VS_STATE_MASTER) - list_add_tail(&sb->list, &ipvs->sync_queue); - else + if (ipvs->sync_state & IP_VS_STATE_MASTER && + ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { + if (!ms->sync_queue_len) + schedule_delayed_work(&ms->master_wakeup_work, + max(IPVS_SYNC_SEND_DELAY, 1)); + ms->sync_queue_len++; + list_add_tail(&sb->list, &ms->sync_queue); + if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) + wake_up_process(ms->master_thread); + } else ip_vs_sync_buff_release(sb); spin_unlock(&ipvs->sync_lock); } @@ -370,49 +376,26 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs) * than the specified time or the specified time is zero. */ static inline struct ip_vs_sync_buff * -get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time) +get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, + unsigned long time) { struct ip_vs_sync_buff *sb; spin_lock_bh(&ipvs->sync_buff_lock); - if (ipvs->sync_buff && - time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) { - sb = ipvs->sync_buff; - ipvs->sync_buff = NULL; + sb = ms->sync_buff; + if (sb && time_after_eq(jiffies - sb->firstuse, time)) { + ms->sync_buff = NULL; + __set_current_state(TASK_RUNNING); } else sb = NULL; spin_unlock_bh(&ipvs->sync_buff_lock); return sb; } -/* - * Switch mode from sending version 0 or 1 - * - must handle sync_buf - */ -void ip_vs_sync_switch_mode(struct net *net, int mode) +static inline int +select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(net); - - if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) - return; - if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff) - return; - - spin_lock_bh(&ipvs->sync_buff_lock); - /* Buffer empty ? then let buf_create do the job */ - if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) { - kfree(ipvs->sync_buff); - ipvs->sync_buff = NULL; - } else { - spin_lock_bh(&ipvs->sync_lock); - if (ipvs->sync_state & IP_VS_STATE_MASTER) - list_add_tail(&ipvs->sync_buff->list, - &ipvs->sync_queue); - else - ip_vs_sync_buff_release(ipvs->sync_buff); - spin_unlock_bh(&ipvs->sync_lock); - } - spin_unlock_bh(&ipvs->sync_buff_lock); + return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; } /* @@ -442,15 +425,101 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) return sb; } +/* Check if conn should be synced. + * pkts: conn packets, use sysctl_sync_threshold to avoid packet check + * - (1) sync_refresh_period: reduce sync rate. Additionally, retry + * sync_retries times with period of sync_refresh_period/8 + * - (2) if both sync_refresh_period and sync_period are 0 send sync only + * for state changes or only once when pkts matches sync_threshold + * - (3) templates: rate can be reduced only with sync_refresh_period or + * with (2) + */ +static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, + struct ip_vs_conn *cp, int pkts) +{ + unsigned long orig = ACCESS_ONCE(cp->sync_endtime); + unsigned long now = jiffies; + unsigned long n = (now + cp->timeout) & ~3UL; + unsigned int sync_refresh_period; + int sync_period; + int force; + + /* Check if we sync in current state */ + if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) + force = 0; + else if (likely(cp->protocol == IPPROTO_TCP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_TCP_S_ESTABLISHED) | + (1 << IP_VS_TCP_S_FIN_WAIT) | + (1 << IP_VS_TCP_S_CLOSE) | + (1 << IP_VS_TCP_S_CLOSE_WAIT) | + (1 << IP_VS_TCP_S_TIME_WAIT)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) + goto set; + } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_SCTP_S_ESTABLISHED) | + (1 << IP_VS_SCTP_S_CLOSED) | + (1 << IP_VS_SCTP_S_SHUT_ACK_CLI) | + (1 << IP_VS_SCTP_S_SHUT_ACK_SER)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) + goto set; + } else { + /* UDP or another protocol with single state */ + force = 0; + } + + sync_refresh_period = sysctl_sync_refresh_period(ipvs); + if (sync_refresh_period > 0) { + long diff = n - orig; + long min_diff = max(cp->timeout >> 1, 10UL * HZ); + + /* Avoid sync if difference is below sync_refresh_period + * and below the half timeout. + */ + if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { + int retries = orig & 3; + + if (retries >= sysctl_sync_retries(ipvs)) + return 0; + if (time_before(now, orig - cp->timeout + + (sync_refresh_period >> 3))) + return 0; + n |= retries + 1; + } + } + sync_period = sysctl_sync_period(ipvs); + if (sync_period > 0) { + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && + pkts % sync_period != sysctl_sync_threshold(ipvs)) + return 0; + } else if (sync_refresh_period <= 0 && + pkts != sysctl_sync_threshold(ipvs)) + return 0; + +set: + cp->old_state = cp->state; + n = cmpxchg(&cp->sync_endtime, orig, n); + return n == orig || force; +} + /* * Version 0 , could be switched in by sys_ctl. * Add an ip_vs_conn information into the current sync_buff. */ -void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) +static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, + int pkts) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg_v0 *m; struct ip_vs_sync_conn_v0 *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; int len; if (unlikely(cp->af != AF_INET)) @@ -459,21 +528,41 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return; + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + return; + spin_lock(&ipvs->sync_buff_lock); - if (!ipvs->sync_buff) { - ipvs->sync_buff = - ip_vs_sync_buff_create_v0(ipvs); - if (!ipvs->sync_buff) { + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; + buff = ms->sync_buff; + if (buff) { + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + /* Send buffer if it is for v1 */ + if (!m->nr_conns) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; + } + } + if (!buff) { + buff = ip_vs_sync_buff_create_v0(ipvs); + if (!buff) { spin_unlock(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } + ms->sync_buff = buff; } len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : SIMPLE_CONN_SIZE; - m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg; - s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head; + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + s = (struct ip_vs_sync_conn_v0 *) buff->head; /* copy members */ s->reserved = 0; @@ -494,18 +583,24 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) m->nr_conns++; m->size += len; - ipvs->sync_buff->head += len; + buff->head += len; /* check if there is a space for next one */ - if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) { - sb_queue_tail(ipvs); - ipvs->sync_buff = NULL; + if (buff->head + FULL_CONN_SIZE > buff->end) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; } spin_unlock(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ - if (cp->control) - ip_vs_sync_conn(net, cp->control); + cp = cp->control; + if (cp) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_add_return(1, &cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); + ip_vs_sync_conn(net, cp->control, pkts); + } } /* @@ -513,23 +608,29 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) * Called by ip_vs_in. * Sending Version 1 messages */ -void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) +void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m; union ip_vs_sync_conn *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; __u8 *p; unsigned int len, pe_name_len, pad; /* Handle old version of the protocol */ if (sysctl_sync_ver(ipvs) == 0) { - ip_vs_sync_conn_v0(net, cp); + ip_vs_sync_conn_v0(net, cp, pkts); return; } /* Do not sync ONE PACKET */ if (cp->flags & IP_VS_CONN_F_ONE_PACKET) goto control; sloop: + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + goto control; + /* Sanity checks */ pe_name_len = 0; if (cp->pe_data_len) { @@ -541,6 +642,13 @@ sloop: } spin_lock(&ipvs->sync_buff_lock); + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -559,27 +667,32 @@ sloop: /* check if there is a space for this one */ pad = 0; - if (ipvs->sync_buff) { - pad = (4 - (size_t)ipvs->sync_buff->head) & 3; - if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) { - sb_queue_tail(ipvs); - ipvs->sync_buff = NULL; + buff = ms->sync_buff; + if (buff) { + m = buff->mesg; + pad = (4 - (size_t) buff->head) & 3; + /* Send buffer if it is for v0 */ + if (buff->head + len + pad > buff->end || m->reserved) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; pad = 0; } } - if (!ipvs->sync_buff) { - ipvs->sync_buff = ip_vs_sync_buff_create(ipvs); - if (!ipvs->sync_buff) { + if (!buff) { + buff = ip_vs_sync_buff_create(ipvs); + if (!buff) { spin_unlock(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } + ms->sync_buff = buff; + m = buff->mesg; } - m = ipvs->sync_buff->mesg; - p = ipvs->sync_buff->head; - ipvs->sync_buff->head += pad + len; + p = buff->head; + buff->head += pad + len; m->size += pad + len; /* Add ev. padding from prev. sync_conn */ while (pad--) @@ -644,16 +757,10 @@ control: cp = cp->control; if (!cp) return; - /* - * Reduce sync rate for templates - * i.e only increment in_pkts for Templates. - */ - if (cp->flags & IP_VS_CONN_F_TEMPLATE) { - int pkts = atomic_add_return(1, &cp->in_pkts); - - if (pkts % sysctl_sync_period(ipvs) != 1) - return; - } + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_add_return(1, &cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); goto sloop; } @@ -731,9 +838,32 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, else cp = ip_vs_ct_in_get(param); - if (cp && param->pe_data) /* Free pe_data */ + if (cp) { + /* Free pe_data */ kfree(param->pe_data); - if (!cp) { + + dest = cp->dest; + spin_lock(&cp->lock); + if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && + !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { + if (flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + } else { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + } + } + flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; + flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; + cp->flags = flags; + spin_unlock(&cp->lock); + if (!dest) { + dest = ip_vs_try_bind_dest(cp); + if (dest) + atomic_dec(&dest->refcnt); + } + } else { /* * Find the appropriate destination for the connection. * If it is not found the connection will remain unbound @@ -742,18 +872,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, param->vport, protocol, fwmark, flags); - /* Set the approprite ativity flag */ - if (protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } else if (protocol == IPPROTO_SCTP) { - if (state != IP_VS_SCTP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); if (dest) atomic_dec(&dest->refcnt); @@ -763,34 +881,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); return; } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && - (cp->state != state)) { - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_SCTP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } } if (opt) @@ -1149,6 +1239,28 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, /* + * Setup sndbuf (mode=1) or rcvbuf (mode=0) + */ +static void set_sock_size(struct sock *sk, int mode, int val) +{ + /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ + /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ + lock_sock(sk); + if (mode) { + val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, + sysctl_wmem_max); + sk->sk_sndbuf = val * 2; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } else { + val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, + sysctl_rmem_max); + sk->sk_rcvbuf = val * 2; + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } + release_sock(sk); +} + +/* * Setup loopback of outgoing multicasts on a sending socket */ static void set_mcast_loop(struct sock *sk, u_char loop) @@ -1298,9 +1410,15 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) /* * Set up sending multicast socket over UDP */ -static struct socket *make_send_sock(struct net *net) +static struct socket *make_send_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; struct socket *sock; int result; @@ -1324,6 +1442,9 @@ static struct socket *make_send_sock(struct net *net) set_mcast_loop(sock->sk, 0); set_mcast_ttl(sock->sk, 1); + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 1, result); result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); if (result < 0) { @@ -1349,9 +1470,15 @@ error: /* * Set up receiving multicast socket over UDP */ -static struct socket *make_receive_sock(struct net *net) +static struct socket *make_receive_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; struct socket *sock; int result; @@ -1369,6 +1496,9 @@ static struct socket *make_receive_sock(struct net *net) sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = SK_CAN_REUSE; + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 0, result); result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, sizeof(struct sockaddr)); @@ -1411,18 +1541,22 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) return len; } -static void +static int ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) { int msize; + int ret; msize = msg->size; /* Put size in network byte order */ msg->size = htons(msg->size); - if (ip_vs_send_async(sock, (char *)msg, msize) != msize) - pr_err("ip_vs_send_async error\n"); + ret = ip_vs_send_async(sock, (char *)msg, msize); + if (ret >= 0 || ret == -EAGAIN) + return ret; + pr_err("ip_vs_send_async error %d\n", ret); + return 0; } static int @@ -1438,48 +1572,90 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) iov.iov_base = buffer; iov.iov_len = (size_t)buflen; - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT); if (len < 0) - return -1; + return len; LeaveFunction(7); return len; } +/* Wakeup the master thread for sending */ +static void master_wakeup_work_handler(struct work_struct *work) +{ + struct ipvs_master_sync_state *ms = + container_of(work, struct ipvs_master_sync_state, + master_wakeup_work.work); + struct netns_ipvs *ipvs = ms->ipvs; + + spin_lock_bh(&ipvs->sync_lock); + if (ms->sync_queue_len && + ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { + ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; + wake_up_process(ms->master_thread); + } + spin_unlock_bh(&ipvs->sync_lock); +} + +/* Get next buffer to send */ +static inline struct ip_vs_sync_buff * +next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) +{ + struct ip_vs_sync_buff *sb; + + sb = sb_dequeue(ipvs, ms); + if (sb) + return sb; + /* Do not delay entries in buffer for more than 2 seconds */ + return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); +} static int sync_thread_master(void *data) { struct ip_vs_sync_thread_data *tinfo = data; struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; + struct sock *sk = tinfo->sock->sk; struct ip_vs_sync_buff *sb; pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " - "syncid = %d\n", - ipvs->master_mcast_ifn, ipvs->master_syncid); + "syncid = %d, id = %d\n", + ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); - while (!kthread_should_stop()) { - while ((sb = sb_dequeue(ipvs))) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); + for (;;) { + sb = next_sync_buff(ipvs, ms); + if (unlikely(kthread_should_stop())) + break; + if (!sb) { + schedule_timeout(IPVS_SYNC_CHECK_PERIOD); + continue; } - - /* check if entries stay in ipvs->sync_buff for 2 seconds */ - sb = get_curr_sync_buff(ipvs, 2 * HZ); - if (sb) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); + while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { + int ret = 0; + + __wait_event_interruptible(*sk_sleep(sk), + sock_writeable(sk) || + kthread_should_stop(), + ret); + if (unlikely(kthread_should_stop())) + goto done; } - - schedule_timeout_interruptible(HZ); + ip_vs_sync_buff_release(sb); } +done: + __set_current_state(TASK_RUNNING); + if (sb) + ip_vs_sync_buff_release(sb); + /* clean up the sync_buff queue */ - while ((sb = sb_dequeue(ipvs))) + while ((sb = sb_dequeue(ipvs, ms))) ip_vs_sync_buff_release(sb); + __set_current_state(TASK_RUNNING); /* clean up the current sync_buff */ - sb = get_curr_sync_buff(ipvs, 0); + sb = get_curr_sync_buff(ipvs, ms, 0); if (sb) ip_vs_sync_buff_release(sb); @@ -1498,8 +1674,8 @@ static int sync_thread_backup(void *data) int len; pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " - "syncid = %d\n", - ipvs->backup_mcast_ifn, ipvs->backup_syncid); + "syncid = %d, id = %d\n", + ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); while (!kthread_should_stop()) { wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -1511,7 +1687,8 @@ static int sync_thread_backup(void *data) len = ip_vs_receive(tinfo->sock, tinfo->buf, ipvs->recv_mesg_maxlen); if (len <= 0) { - pr_err("receiving message error\n"); + if (len != -EAGAIN) + pr_err("receiving message error\n"); break; } @@ -1535,86 +1712,140 @@ static int sync_thread_backup(void *data) int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) { struct ip_vs_sync_thread_data *tinfo; - struct task_struct **realtask, *task; + struct task_struct **array = NULL, *task; struct socket *sock; struct netns_ipvs *ipvs = net_ipvs(net); - char *name, *buf = NULL; + char *name; int (*threadfn)(void *data); + int id, count; int result = -ENOMEM; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); + if (!ipvs->sync_state) { + count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); + ipvs->threads_mask = count - 1; + } else + count = ipvs->threads_mask + 1; if (state == IP_VS_STATE_MASTER) { - if (ipvs->master_thread) + if (ipvs->ms) return -EEXIST; strlcpy(ipvs->master_mcast_ifn, mcast_ifn, sizeof(ipvs->master_mcast_ifn)); ipvs->master_syncid = syncid; - realtask = &ipvs->master_thread; - name = "ipvs_master:%d"; + name = "ipvs-m:%d:%d"; threadfn = sync_thread_master; - sock = make_send_sock(net); } else if (state == IP_VS_STATE_BACKUP) { - if (ipvs->backup_thread) + if (ipvs->backup_threads) return -EEXIST; strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, sizeof(ipvs->backup_mcast_ifn)); ipvs->backup_syncid = syncid; - realtask = &ipvs->backup_thread; - name = "ipvs_backup:%d"; + name = "ipvs-b:%d:%d"; threadfn = sync_thread_backup; - sock = make_receive_sock(net); } else { return -EINVAL; } - if (IS_ERR(sock)) { - result = PTR_ERR(sock); - goto out; - } + if (state == IP_VS_STATE_MASTER) { + struct ipvs_master_sync_state *ms; - set_sync_mesg_maxlen(net, state); - if (state == IP_VS_STATE_BACKUP) { - buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL); - if (!buf) - goto outsocket; + ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL); + if (!ipvs->ms) + goto out; + ms = ipvs->ms; + for (id = 0; id < count; id++, ms++) { + INIT_LIST_HEAD(&ms->sync_queue); + ms->sync_queue_len = 0; + ms->sync_queue_delay = 0; + INIT_DELAYED_WORK(&ms->master_wakeup_work, + master_wakeup_work_handler); + ms->ipvs = ipvs; + } + } else { + array = kzalloc(count * sizeof(struct task_struct *), + GFP_KERNEL); + if (!array) + goto out; } + set_sync_mesg_maxlen(net, state); - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - goto outbuf; - - tinfo->net = net; - tinfo->sock = sock; - tinfo->buf = buf; + tinfo = NULL; + for (id = 0; id < count; id++) { + if (state == IP_VS_STATE_MASTER) + sock = make_send_sock(net, id); + else + sock = make_receive_sock(net, id); + if (IS_ERR(sock)) { + result = PTR_ERR(sock); + goto outtinfo; + } + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + goto outsocket; + tinfo->net = net; + tinfo->sock = sock; + if (state == IP_VS_STATE_BACKUP) { + tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, + GFP_KERNEL); + if (!tinfo->buf) + goto outtinfo; + } + tinfo->id = id; - task = kthread_run(threadfn, tinfo, name, ipvs->gen); - if (IS_ERR(task)) { - result = PTR_ERR(task); - goto outtinfo; + task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } + tinfo = NULL; + if (state == IP_VS_STATE_MASTER) + ipvs->ms[id].master_thread = task; + else + array[id] = task; } /* mark as active */ - *realtask = task; + + if (state == IP_VS_STATE_BACKUP) + ipvs->backup_threads = array; + spin_lock_bh(&ipvs->sync_buff_lock); ipvs->sync_state |= state; + spin_unlock_bh(&ipvs->sync_buff_lock); /* increase the module use count */ ip_vs_use_count_inc(); return 0; -outtinfo: - kfree(tinfo); -outbuf: - kfree(buf); outsocket: sk_release_kernel(sock->sk); + +outtinfo: + if (tinfo) { + sk_release_kernel(tinfo->sock->sk); + kfree(tinfo->buf); + kfree(tinfo); + } + count = id; + while (count-- > 0) { + if (state == IP_VS_STATE_MASTER) + kthread_stop(ipvs->ms[count].master_thread); + else + kthread_stop(array[count]); + } + kfree(array); + out: + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + kfree(ipvs->ms); + ipvs->ms = NULL; + } return result; } @@ -1622,38 +1853,60 @@ out: int stop_sync_thread(struct net *net, int state) { struct netns_ipvs *ipvs = net_ipvs(net); + struct task_struct **array; + int id; int retc = -EINVAL; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); if (state == IP_VS_STATE_MASTER) { - if (!ipvs->master_thread) + if (!ipvs->ms) return -ESRCH; - pr_info("stopping master sync thread %d ...\n", - task_pid_nr(ipvs->master_thread)); - /* * The lock synchronizes with sb_queue_tail(), so that we don't * add sync buffers to the queue, when we are already in * progress of stopping the master sync daemon. */ - spin_lock_bh(&ipvs->sync_lock); + spin_lock_bh(&ipvs->sync_buff_lock); + spin_lock(&ipvs->sync_lock); ipvs->sync_state &= ~IP_VS_STATE_MASTER; - spin_unlock_bh(&ipvs->sync_lock); - retc = kthread_stop(ipvs->master_thread); - ipvs->master_thread = NULL; + spin_unlock(&ipvs->sync_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); + + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + struct ipvs_master_sync_state *ms = &ipvs->ms[id]; + int ret; + + pr_info("stopping master sync thread %d ...\n", + task_pid_nr(ms->master_thread)); + cancel_delayed_work_sync(&ms->master_wakeup_work); + ret = kthread_stop(ms->master_thread); + if (retc >= 0) + retc = ret; + } + kfree(ipvs->ms); + ipvs->ms = NULL; } else if (state == IP_VS_STATE_BACKUP) { - if (!ipvs->backup_thread) + if (!ipvs->backup_threads) return -ESRCH; - pr_info("stopping backup sync thread %d ...\n", - task_pid_nr(ipvs->backup_thread)); - ipvs->sync_state &= ~IP_VS_STATE_BACKUP; - retc = kthread_stop(ipvs->backup_thread); - ipvs->backup_thread = NULL; + array = ipvs->backup_threads; + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + int ret; + + pr_info("stopping backup sync thread %d ...\n", + task_pid_nr(array[id])); + ret = kthread_stop(array[id]); + if (retc >= 0) + retc = ret; + } + kfree(array); + ipvs->backup_threads = NULL; } /* decrease the module use count */ @@ -1670,13 +1923,8 @@ int __net_init ip_vs_sync_net_init(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); - INIT_LIST_HEAD(&ipvs->sync_queue); spin_lock_init(&ipvs->sync_lock); spin_lock_init(&ipvs->sync_buff_lock); - - ipvs->sync_mcast_addr.sin_family = AF_INET; - ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT); - ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index fd0d4e09876a..231be7dd547a 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -84,7 +84,7 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) /* * Allocate the mark variable for WRR scheduling */ - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); if (mark == NULL) return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index cf0747c5741f..32c59093146e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1336,7 +1336,6 @@ static void nf_conntrack_cleanup_init_net(void) while (untrack_refs() > 0) schedule(); - nf_conntrack_helper_fini(); nf_conntrack_proto_fini(); #ifdef CONFIG_NF_CONNTRACK_ZONES nf_ct_extend_unregister(&nf_ct_zone_extend); @@ -1354,6 +1353,7 @@ static void nf_conntrack_cleanup_net(struct net *net) } nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); + nf_conntrack_helper_fini(net); nf_conntrack_timeout_fini(net); nf_conntrack_ecache_fini(net); nf_conntrack_tstamp_fini(net); @@ -1504,10 +1504,6 @@ static int nf_conntrack_init_init_net(void) if (ret < 0) goto err_proto; - ret = nf_conntrack_helper_init(); - if (ret < 0) - goto err_helper; - #ifdef CONFIG_NF_CONNTRACK_ZONES ret = nf_ct_extend_register(&nf_ct_zone_extend); if (ret < 0) @@ -1525,10 +1521,8 @@ static int nf_conntrack_init_init_net(void) #ifdef CONFIG_NF_CONNTRACK_ZONES err_extend: - nf_conntrack_helper_fini(); -#endif -err_helper: nf_conntrack_proto_fini(); +#endif err_proto: return ret; } @@ -1589,9 +1583,14 @@ static int nf_conntrack_init_net(struct net *net) ret = nf_conntrack_timeout_init(net); if (ret < 0) goto err_timeout; + ret = nf_conntrack_helper_init(net); + if (ret < 0) + goto err_helper; return 0; +err_helper: + nf_conntrack_timeout_fini(net); err_timeout: nf_conntrack_ecache_fini(net); err_ecache: diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index b924f3a49a8e..e7be79e640de 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); int nf_conntrack_register_notifier(struct net *net, struct nf_ct_event_notifier *new) { - int ret = 0; + int ret; struct nf_ct_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); @@ -95,8 +95,7 @@ int nf_conntrack_register_notifier(struct net *net, goto out_unlock; } rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); - mutex_unlock(&nf_ct_ecache_mutex); - return ret; + ret = 0; out_unlock: mutex_unlock(&nf_ct_ecache_mutex); @@ -121,7 +120,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); int nf_ct_expect_register_notifier(struct net *net, struct nf_exp_event_notifier *new) { - int ret = 0; + int ret; struct nf_exp_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); @@ -132,8 +131,7 @@ int nf_ct_expect_register_notifier(struct net *net, goto out_unlock; } rcu_assign_pointer(net->ct.nf_expect_event_cb, new); - mutex_unlock(&nf_ct_ecache_mutex); - return ret; + ret = 0; out_unlock: mutex_unlock(&nf_ct_ecache_mutex); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 436b7cb79ba4..4fa2ff961f5a 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -34,6 +34,67 @@ static struct hlist_head *nf_ct_helper_hash __read_mostly; static unsigned int nf_ct_helper_hsize __read_mostly; static unsigned int nf_ct_helper_count __read_mostly; +static bool nf_ct_auto_assign_helper __read_mostly = true; +module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); +MODULE_PARM_DESC(nf_conntrack_helper, + "Enable automatic conntrack helper assignment (default 1)"); + +#ifdef CONFIG_SYSCTL +static struct ctl_table helper_sysctl_table[] = { + { + .procname = "nf_conntrack_helper", + .data = &init_net.ct.sysctl_auto_assign_helper, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; + +static int nf_conntrack_helper_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(helper_sysctl_table, sizeof(helper_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_auto_assign_helper; + + net->ct.helper_sysctl_header = + register_net_sysctl(net, "net/netfilter", table); + + if (!net->ct.helper_sysctl_header) { + pr_err("nf_conntrack_helper: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_helper_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.helper_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.helper_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_helper_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_helper_fini_sysctl(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */ /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. */ @@ -118,17 +179,38 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, { struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; + struct net *net = nf_ct_net(ct); int ret = 0; + /* We already got a helper explicitly attached. The function + * nf_conntrack_alter_reply - in case NAT is in use - asks for looking + * the helper up again. Since now the user is in full control of + * making consistent helper configurations, skip this automatic + * re-lookup, otherwise we'll lose the helper. + */ + if (test_bit(IPS_HELPER_BIT, &ct->status)) + return 0; + if (tmpl != NULL) { help = nfct_help(tmpl); - if (help != NULL) + if (help != NULL) { helper = help->helper; + set_bit(IPS_HELPER_BIT, &ct->status); + } } help = nfct_help(ct); - if (helper == NULL) + if (net->ct.sysctl_auto_assign_helper && helper == NULL) { helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + if (unlikely(!net->ct.auto_assign_helper_warned && helper)) { + pr_info("nf_conntrack: automatic helper " + "assignment is deprecated and it will " + "be removed soon. Use the iptables CT target " + "to attach helpers instead.\n"); + net->ct.auto_assign_helper_warned = true; + } + } + if (helper == NULL) { if (help) RCU_INIT_POINTER(help->helper, NULL); @@ -315,28 +397,44 @@ static struct nf_ct_ext_type helper_extend __read_mostly = { .id = NF_CT_EXT_HELPER, }; -int nf_conntrack_helper_init(void) +int nf_conntrack_helper_init(struct net *net) { int err; - nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ - nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); - if (!nf_ct_helper_hash) - return -ENOMEM; + net->ct.auto_assign_helper_warned = false; + net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper; + + if (net_eq(net, &init_net)) { + nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ + nf_ct_helper_hash = + nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); + if (!nf_ct_helper_hash) + return -ENOMEM; - err = nf_ct_extend_register(&helper_extend); + err = nf_ct_extend_register(&helper_extend); + if (err < 0) + goto err1; + } + + err = nf_conntrack_helper_init_sysctl(net); if (err < 0) - goto err1; + goto out_sysctl; return 0; +out_sysctl: + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&helper_extend); err1: nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); return err; } -void nf_conntrack_helper_fini(void) +void nf_conntrack_helper_fini(struct net *net) { - nf_ct_extend_unregister(&helper_extend); - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); + nf_conntrack_helper_fini_sysctl(net); + if (net_eq(net, &init_net)) { + nf_ct_extend_unregister(&helper_extend); + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); + } } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 462ec2dbe561..6f4b00a8fc73 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2080,7 +2080,15 @@ static int ctnetlink_change_expect(struct nf_conntrack_expect *x, const struct nlattr * const cda[]) { - return -EOPNOTSUPP; + if (cda[CTA_EXPECT_TIMEOUT]) { + if (!del_timer(&x->timeout)) + return -ETIME; + + x->timeout.expires = jiffies + + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; + add_timer(&x->timeout); + } + return 0; } static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { |