From 4282d60689d4f21b40692029080440cc58e8a17d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Jan 2015 11:36:55 -0500 Subject: tracefs: Add new tracefs file system Add a separate file system to handle the tracing directory. Currently it is part of debugfs, but that is starting to show its limits. One thing is that in order to access the tracing infrastructure, you need to mount debugfs. As that includes debugging from all sorts of sub systems in the kernel, it is not considered advisable to mount such an all encompassing debugging system. Having the tracing system in its own file systems gives access to the tracing sub system without needing to include all other systems. Another problem with tracing using the debugfs system is that the instances use mkdir to create sub buffers. debugfs does not support mkdir from userspace so to implement it, special hacks were used. By controlling the file system that the tracing infrastructure uses, this can be properly done without hacks. Signed-off-by: Steven Rostedt --- include/uapi/linux/magic.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 7d664ea85ebd..7b1425a6b370 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -58,6 +58,8 @@ #define STACK_END_MAGIC 0x57AC6E9D +#define TRACEFS_MAGIC 0x74726163 + #define V9FS_MAGIC 0x01021997 #define BDEVFS_MAGIC 0x62646576 -- cgit v1.2.3 From cd67cd5eb25ae9a7bafbfd3d52d4c05e1d80af3b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 6 Feb 2015 09:44:44 +0200 Subject: ipvs: use 64-bit rates in stats IPVS stats are limited to 2^(32-10) conns/s and packets/s, 2^(32-5) bytes/s. It is time to use 64 bits: * Change all conn/packet kernel counters to 64-bit and update them in u64_stats_update_{begin,end} section * In kernel use struct ip_vs_kstats instead of the user-space struct ip_vs_stats_user and use new func ip_vs_export_stats_user to export it to sockopt users to preserve compatibility with 32-bit values * Rename cpu counters "ustats" to "cnt" * To netlink users provide additionally 64-bit stats: IPVS_SVC_ATTR_STATS64 and IPVS_DEST_ATTR_STATS64. Old stats remain for old binaries. * We can use ip_vs_copy_stats in ip_vs_stats_percpu_show Thanks to Chris Caputo for providing initial patch for ip_vs_est.c Signed-off-by: Chris Caputo Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 50 ++++++++---- include/uapi/linux/ip_vs.h | 7 +- net/netfilter/ipvs/ip_vs_core.c | 36 +++++---- net/netfilter/ipvs/ip_vs_ctl.c | 174 ++++++++++++++++++++++++++-------------- net/netfilter/ipvs/ip_vs_est.c | 102 ++++++++++++----------- 5 files changed, 226 insertions(+), 143 deletions(-) (limited to 'include/uapi') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 615b20b58545..a627fe690c19 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -365,15 +365,15 @@ struct ip_vs_seq { /* counters per cpu */ struct ip_vs_counters { - __u32 conns; /* connections scheduled */ - __u32 inpkts; /* incoming packets */ - __u32 outpkts; /* outgoing packets */ + __u64 conns; /* connections scheduled */ + __u64 inpkts; /* incoming packets */ + __u64 outpkts; /* outgoing packets */ __u64 inbytes; /* incoming bytes */ __u64 outbytes; /* outgoing bytes */ }; /* Stats per cpu */ struct ip_vs_cpu_stats { - struct ip_vs_counters ustats; + struct ip_vs_counters cnt; struct u64_stats_sync syncp; }; @@ -383,23 +383,40 @@ struct ip_vs_estimator { u64 last_inbytes; u64 last_outbytes; - u32 last_conns; - u32 last_inpkts; - u32 last_outpkts; - - u32 cps; - u32 inpps; - u32 outpps; - u32 inbps; - u32 outbps; + u64 last_conns; + u64 last_inpkts; + u64 last_outpkts; + + u64 cps; + u64 inpps; + u64 outpps; + u64 inbps; + u64 outbps; +}; + +/* + * IPVS statistics object, 64-bit kernel version of struct ip_vs_stats_user + */ +struct ip_vs_kstats { + u64 conns; /* connections scheduled */ + u64 inpkts; /* incoming packets */ + u64 outpkts; /* outgoing packets */ + u64 inbytes; /* incoming bytes */ + u64 outbytes; /* outgoing bytes */ + + u64 cps; /* current connection rate */ + u64 inpps; /* current in packet rate */ + u64 outpps; /* current out packet rate */ + u64 inbps; /* current in byte rate */ + u64 outbps; /* current out byte rate */ }; struct ip_vs_stats { - struct ip_vs_stats_user ustats; /* statistics */ + struct ip_vs_kstats kstats; /* kernel statistics */ struct ip_vs_estimator est; /* estimator */ struct ip_vs_cpu_stats __percpu *cpustats; /* per cpu counters */ spinlock_t lock; /* spin lock */ - struct ip_vs_stats_user ustats0; /* reset values */ + struct ip_vs_kstats kstats0; /* reset values */ }; struct dst_entry; @@ -1388,8 +1405,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts); void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); -void ip_vs_read_estimator(struct ip_vs_stats_user *dst, - struct ip_vs_stats *stats); +void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index cabe95d5b461..3199243f2028 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -358,6 +358,8 @@ enum { IPVS_SVC_ATTR_PE_NAME, /* name of ct retriever */ + IPVS_SVC_ATTR_STATS64, /* nested attribute for service stats */ + __IPVS_SVC_ATTR_MAX, }; @@ -387,6 +389,8 @@ enum { IPVS_DEST_ATTR_ADDR_FAMILY, /* Address family of address */ + IPVS_DEST_ATTR_STATS64, /* nested attribute for dest stats */ + __IPVS_DEST_ATTR_MAX, }; @@ -410,7 +414,8 @@ enum { /* * Attributes used to describe service or destination entry statistics * - * Used inside nested attributes IPVS_SVC_ATTR_STATS and IPVS_DEST_ATTR_STATS + * Used inside nested attributes IPVS_SVC_ATTR_STATS, IPVS_DEST_ATTR_STATS, + * IPVS_SVC_ATTR_STATS64 and IPVS_DEST_ATTR_STATS64. */ enum { IPVS_STATS_ATTR_UNSPEC = 0, diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 990decba1fe4..c9470c86308f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -119,24 +119,24 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); - s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.inbytes += skb->len; + s->cnt.inpkts++; + s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_lock(); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); - s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.inbytes += skb->len; + s->cnt.inpkts++; + s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); - s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.inbytes += skb->len; + s->cnt.inpkts++; + s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); } } @@ -153,24 +153,24 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); - s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.outbytes += skb->len; + s->cnt.outpkts++; + s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_lock(); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); - s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.outbytes += skb->len; + s->cnt.outpkts++; + s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); - s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.outbytes += skb->len; + s->cnt.outpkts++; + s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); } } @@ -183,13 +183,19 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) struct ip_vs_cpu_stats *s; s = this_cpu_ptr(cp->dest->stats.cpustats); - s->ustats.conns++; + u64_stats_update_begin(&s->syncp); + s->cnt.conns++; + u64_stats_update_end(&s->syncp); s = this_cpu_ptr(svc->stats.cpustats); - s->ustats.conns++; + u64_stats_update_begin(&s->syncp); + s->cnt.conns++; + u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats.cpustats); - s->ustats.conns++; + u64_stats_update_begin(&s->syncp); + s->cnt.conns++; + u64_stats_update_end(&s->syncp); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e55759056361..6fd60059faf0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -729,9 +729,9 @@ static void ip_vs_trash_cleanup(struct net *net) } static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { -#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c +#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c spin_lock_bh(&src->lock); @@ -746,6 +746,21 @@ ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) spin_unlock_bh(&src->lock); } +static void +ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) +{ + dst->conns = (u32)src->conns; + dst->inpkts = (u32)src->inpkts; + dst->outpkts = (u32)src->outpkts; + dst->inbytes = src->inbytes; + dst->outbytes = src->outbytes; + dst->cps = (u32)src->cps; + dst->inpps = (u32)src->inpps; + dst->outpps = (u32)src->outpps; + dst->inbps = (u32)src->inbps; + dst->outbps = (u32)src->outbps; +} + static void ip_vs_zero_stats(struct ip_vs_stats *stats) { @@ -753,7 +768,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) /* get current counters as zero point, rates are zeroed */ -#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c +#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c IP_VS_ZERO_STATS_COUNTER(conns); IP_VS_ZERO_STATS_COUNTER(inpkts); @@ -2044,7 +2059,7 @@ static const struct file_operations ip_vs_info_fops = { static int ip_vs_stats_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); - struct ip_vs_stats_user show; + struct ip_vs_kstats show; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, @@ -2053,17 +2068,22 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) " Conns Packets Packets Bytes Bytes\n"); ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns, - show.inpkts, show.outpkts, - (unsigned long long) show.inbytes, - (unsigned long long) show.outbytes); - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", + (unsigned long long)show.conns, + (unsigned long long)show.inpkts, + (unsigned long long)show.outpkts, + (unsigned long long)show.inbytes, + (unsigned long long)show.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ seq_puts(seq, - " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq, "%8X %8X %8X %16X %16X\n", - show.cps, show.inpps, show.outpps, - show.inbps, show.outbps); + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", + (unsigned long long)show.cps, + (unsigned long long)show.inpps, + (unsigned long long)show.outpps, + (unsigned long long)show.inbps, + (unsigned long long)show.outbps); return 0; } @@ -2086,7 +2106,7 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; - struct ip_vs_stats_user rates; + struct ip_vs_kstats kstats; int i; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ @@ -2098,41 +2118,41 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) for_each_possible_cpu(i) { struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); unsigned int start; - __u64 inbytes, outbytes; + u64 conns, inpkts, outpkts, inbytes, outbytes; do { start = u64_stats_fetch_begin_irq(&u->syncp); - inbytes = u->ustats.inbytes; - outbytes = u->ustats.outbytes; + conns = u->cnt.conns; + inpkts = u->cnt.inpkts; + outpkts = u->cnt.outpkts; + inbytes = u->cnt.inbytes; + outbytes = u->cnt.outbytes; } while (u64_stats_fetch_retry_irq(&u->syncp, start)); - seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", - i, u->ustats.conns, u->ustats.inpkts, - u->ustats.outpkts, (__u64)inbytes, - (__u64)outbytes); + seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", + i, (u64)conns, (u64)inpkts, + (u64)outpkts, (u64)inbytes, + (u64)outbytes); } - spin_lock_bh(&tot_stats->lock); - - seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n", - tot_stats->ustats.conns, tot_stats->ustats.inpkts, - tot_stats->ustats.outpkts, - (unsigned long long) tot_stats->ustats.inbytes, - (unsigned long long) tot_stats->ustats.outbytes); - - ip_vs_read_estimator(&rates, tot_stats); + ip_vs_copy_stats(&kstats, tot_stats); - spin_unlock_bh(&tot_stats->lock); + seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", + (unsigned long long)kstats.conns, + (unsigned long long)kstats.inpkts, + (unsigned long long)kstats.outpkts, + (unsigned long long)kstats.inbytes, + (unsigned long long)kstats.outbytes); -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ +/* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, - " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq, " %8X %8X %8X %16X %16X\n", - rates.cps, - rates.inpps, - rates.outpps, - rates.inbps, - rates.outbps); + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", + kstats.cps, + kstats.inpps, + kstats.outpps, + kstats.inbps, + kstats.outbps); return 0; } @@ -2400,6 +2420,7 @@ static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { struct ip_vs_scheduler *sched; + struct ip_vs_kstats kstats; sched = rcu_dereference_protected(src->scheduler, 1); dst->protocol = src->protocol; @@ -2411,7 +2432,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; dst->num_dests = src->num_dests; - ip_vs_copy_stats(&dst->stats, &src->stats); + ip_vs_copy_stats(&kstats, &src->stats); + ip_vs_export_stats_user(&dst->stats, &kstats); } static inline int @@ -2485,6 +2507,7 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, int count = 0; struct ip_vs_dest *dest; struct ip_vs_dest_entry entry; + struct ip_vs_kstats kstats; memset(&entry, 0, sizeof(entry)); list_for_each_entry(dest, &svc->destinations, n_list) { @@ -2506,7 +2529,8 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, entry.activeconns = atomic_read(&dest->activeconns); entry.inactconns = atomic_read(&dest->inactconns); entry.persistconns = atomic_read(&dest->persistconns); - ip_vs_copy_stats(&entry.stats, &dest->stats); + ip_vs_copy_stats(&kstats, &dest->stats); + ip_vs_export_stats_user(&entry.stats, &kstats); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; @@ -2798,25 +2822,51 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, - struct ip_vs_stats *stats) + struct ip_vs_kstats *kstats) +{ + struct nlattr *nl_stats = nla_nest_start(skb, container_type); + + if (!nl_stats) + return -EMSGSIZE; + + if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) || + nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) + goto nla_put_failure; + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, + struct ip_vs_kstats *kstats) { - struct ip_vs_stats_user ustats; struct nlattr *nl_stats = nla_nest_start(skb, container_type); + if (!nl_stats) return -EMSGSIZE; - ip_vs_copy_stats(&ustats, stats); - - if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) || - nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) || - nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) || - nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) || - nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) || - nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) || - nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) || - nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) || - nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps)) + if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, kstats->conns) || + nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_CPS, kstats->cps) || + nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps)) goto nla_put_failure; nla_nest_end(skb, nl_stats); @@ -2835,6 +2885,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; + struct ip_vs_kstats kstats; nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) @@ -2860,7 +2911,10 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) goto nla_put_failure; - if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) + ip_vs_copy_stats(&kstats, &svc->stats); + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats)) + goto nla_put_failure; + if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_service); @@ -3032,6 +3086,7 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) { struct nlattr *nl_dest; + struct ip_vs_kstats kstats; nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); if (!nl_dest) @@ -3054,7 +3109,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) atomic_read(&dest->persistconns)) || nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; - if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) + ip_vs_copy_stats(&kstats, &dest->stats); + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats)) + goto nla_put_failure; + if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_dest); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 1425e9a924c4..ef0eb0a8d552 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -45,17 +45,19 @@ NOTES. - * The stored value for average bps is scaled by 2^5, so that maximal - rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. + * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10. - * A lot code is taken from net/sched/estimator.c + * Netlink users can see 64-bit values but sockopt users are restricted + to 32-bit values for conns, packets, bps, cps and pps. + + * A lot of code is taken from net/core/gen_estimator.c */ /* * Make a summary from each cpu */ -static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, +static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, struct ip_vs_cpu_stats __percpu *stats) { int i; @@ -64,27 +66,31 @@ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, for_each_possible_cpu(i) { struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); unsigned int start; - __u64 inbytes, outbytes; + u64 conns, inpkts, outpkts, inbytes, outbytes; + if (add) { - sum->conns += s->ustats.conns; - sum->inpkts += s->ustats.inpkts; - sum->outpkts += s->ustats.outpkts; do { start = u64_stats_fetch_begin(&s->syncp); - inbytes = s->ustats.inbytes; - outbytes = s->ustats.outbytes; + conns = s->cnt.conns; + inpkts = s->cnt.inpkts; + outpkts = s->cnt.outpkts; + inbytes = s->cnt.inbytes; + outbytes = s->cnt.outbytes; } while (u64_stats_fetch_retry(&s->syncp, start)); + sum->conns += conns; + sum->inpkts += inpkts; + sum->outpkts += outpkts; sum->inbytes += inbytes; sum->outbytes += outbytes; } else { add = true; - sum->conns = s->ustats.conns; - sum->inpkts = s->ustats.inpkts; - sum->outpkts = s->ustats.outpkts; do { start = u64_stats_fetch_begin(&s->syncp); - sum->inbytes = s->ustats.inbytes; - sum->outbytes = s->ustats.outbytes; + sum->conns = s->cnt.conns; + sum->inpkts = s->cnt.inpkts; + sum->outpkts = s->cnt.outpkts; + sum->inbytes = s->cnt.inbytes; + sum->outbytes = s->cnt.outbytes; } while (u64_stats_fetch_retry(&s->syncp, start)); } } @@ -95,10 +101,7 @@ static void estimation_timer(unsigned long arg) { struct ip_vs_estimator *e; struct ip_vs_stats *s; - u32 n_conns; - u32 n_inpkts, n_outpkts; - u64 n_inbytes, n_outbytes; - u32 rate; + u64 rate; struct net *net = (struct net *)arg; struct netns_ipvs *ipvs; @@ -108,33 +111,29 @@ static void estimation_timer(unsigned long arg) s = container_of(e, struct ip_vs_stats, est); spin_lock(&s->lock); - ip_vs_read_cpu_stats(&s->ustats, s->cpustats); - n_conns = s->ustats.conns; - n_inpkts = s->ustats.inpkts; - n_outpkts = s->ustats.outpkts; - n_inbytes = s->ustats.inbytes; - n_outbytes = s->ustats.outbytes; + ip_vs_read_cpu_stats(&s->kstats, s->cpustats); /* scaled by 2^10, but divided 2 seconds */ - rate = (n_conns - e->last_conns) << 9; - e->last_conns = n_conns; - e->cps += ((long)rate - (long)e->cps) >> 2; - - rate = (n_inpkts - e->last_inpkts) << 9; - e->last_inpkts = n_inpkts; - e->inpps += ((long)rate - (long)e->inpps) >> 2; - - rate = (n_outpkts - e->last_outpkts) << 9; - e->last_outpkts = n_outpkts; - e->outpps += ((long)rate - (long)e->outpps) >> 2; - - rate = (n_inbytes - e->last_inbytes) << 4; - e->last_inbytes = n_inbytes; - e->inbps += ((long)rate - (long)e->inbps) >> 2; - - rate = (n_outbytes - e->last_outbytes) << 4; - e->last_outbytes = n_outbytes; - e->outbps += ((long)rate - (long)e->outbps) >> 2; + rate = (s->kstats.conns - e->last_conns) << 9; + e->last_conns = s->kstats.conns; + e->cps += ((s64)rate - (s64)e->cps) >> 2; + + rate = (s->kstats.inpkts - e->last_inpkts) << 9; + e->last_inpkts = s->kstats.inpkts; + e->inpps += ((s64)rate - (s64)e->inpps) >> 2; + + rate = (s->kstats.outpkts - e->last_outpkts) << 9; + e->last_outpkts = s->kstats.outpkts; + e->outpps += ((s64)rate - (s64)e->outpps) >> 2; + + /* scaled by 2^5, but divided 2 seconds */ + rate = (s->kstats.inbytes - e->last_inbytes) << 4; + e->last_inbytes = s->kstats.inbytes; + e->inbps += ((s64)rate - (s64)e->inbps) >> 2; + + rate = (s->kstats.outbytes - e->last_outbytes) << 4; + e->last_outbytes = s->kstats.outbytes; + e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } spin_unlock(&ipvs->est_lock); @@ -166,14 +165,14 @@ void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats) void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; - struct ip_vs_stats_user *u = &stats->ustats; + struct ip_vs_kstats *k = &stats->kstats; /* reset counters, caller must hold the stats->lock lock */ - est->last_inbytes = u->inbytes; - est->last_outbytes = u->outbytes; - est->last_conns = u->conns; - est->last_inpkts = u->inpkts; - est->last_outpkts = u->outpkts; + est->last_inbytes = k->inbytes; + est->last_outbytes = k->outbytes; + est->last_conns = k->conns; + est->last_inpkts = k->inpkts; + est->last_outpkts = k->outpkts; est->cps = 0; est->inpps = 0; est->outpps = 0; @@ -182,8 +181,7 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats) } /* Get decoded rates */ -void ip_vs_read_estimator(struct ip_vs_stats_user *dst, - struct ip_vs_stats *stats) +void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) { struct ip_vs_estimator *e = &stats->est; -- cgit v1.2.3 From e3eb3250d84ef97b766312345774367b6a310db8 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 5 Feb 2015 14:41:52 +0000 Subject: drm: add support for tiled/compressed/etc modifier in addfb2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In DRM/KMS we are lacking a good way to deal with tiled/compressed formats. Especially in the case of dmabuf/prime buffer sharing, where we cannot always rely on under-the-hood flags passed to driver specific gem-create ioctl to pass around these extra flags. The proposal is to add a per-plane format modifier. This allows to, if necessary, use different tiling patters for sub-sampled planes, etc. The format modifiers are added at the end of the ioctl struct, so for legacy userspace it will be zero padded. v1: original v1.5: increase modifier to 64b v2: Incorporate review comments from the big thread, plus a few more. - Add a getcap so that userspace doesn't have to jump through hoops. - Allow modifiers only when a flag is set. That way drivers know when they're dealing with old userspace and need to fish out e.g. tiling from other information. - After rolling out checks for ->modifier to all drivers I've decided that this is way too fragile and needs an explicit opt-in flag. So do that instead. - Add a define (just for documentation really) for the "NONE" modifier. Imo we don't need to add mask #defines since drivers really should only do exact matches against values defined with fourcc_mod_code. - Drop the Samsung tiling modifier on Rob's request since he's not yet sure whether that one is accurate. v3: - Also add a new ->modifier[] array to struct drm_framebuffer and fill it in drm_helper_mode_fill_fb_struct. Requested by Tvrkto Uruslin. - Remove TODO in comment and add code comment that modifiers should be properly documented, requested by Rob. Cc: Rob Clark Cc: Tvrtko Ursulin Cc: Laurent Pinchart Cc: Daniel Stone Cc: Ville Syrjälä Cc: Michel Dänzer Signed-off-by: Rob Clark (v1.5) Reviewed-by: Rob Clark Reviewed-by: Daniel Stone Acked-by: Dave Airlie Signed-off-by: Daniel Vetter --- drivers/gpu/drm/drm_crtc.c | 14 +++++++++++++- drivers/gpu/drm/drm_crtc_helper.c | 1 + drivers/gpu/drm/drm_ioctl.c | 3 +++ include/drm/drm_crtc.h | 4 ++++ include/uapi/drm/drm.h | 1 + include/uapi/drm/drm_fourcc.h | 32 ++++++++++++++++++++++++++++++++ include/uapi/drm/drm_mode.h | 9 +++++++++ 7 files changed, 63 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index ad2934ba0bd2..b15d720eda4c 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -3314,6 +3314,12 @@ static int framebuffer_check(const struct drm_mode_fb_cmd2 *r) DRM_DEBUG_KMS("bad pitch %u for plane %d\n", r->pitches[i], i); return -EINVAL; } + + if (r->modifier[i] && !(r->flags & DRM_MODE_FB_MODIFIERS)) { + DRM_DEBUG_KMS("bad fb modifier %llu for plane %d\n", + r->modifier[i], i); + return -EINVAL; + } } return 0; @@ -3327,7 +3333,7 @@ static struct drm_framebuffer *add_framebuffer_internal(struct drm_device *dev, struct drm_framebuffer *fb; int ret; - if (r->flags & ~DRM_MODE_FB_INTERLACED) { + if (r->flags & ~(DRM_MODE_FB_INTERLACED | DRM_MODE_FB_MODIFIERS)) { DRM_DEBUG_KMS("bad framebuffer flags 0x%08x\n", r->flags); return ERR_PTR(-EINVAL); } @@ -3343,6 +3349,12 @@ static struct drm_framebuffer *add_framebuffer_internal(struct drm_device *dev, return ERR_PTR(-EINVAL); } + if (r->flags & DRM_MODE_FB_MODIFIERS && + !dev->mode_config.allow_fb_modifiers) { + DRM_DEBUG_KMS("driver does not support fb modifiers\n"); + return ERR_PTR(-EINVAL); + } + ret = framebuffer_check(r); if (ret) return ERR_PTR(ret); diff --git a/drivers/gpu/drm/drm_crtc_helper.c b/drivers/gpu/drm/drm_crtc_helper.c index b1979e7bdc88..3053aab968f9 100644 --- a/drivers/gpu/drm/drm_crtc_helper.c +++ b/drivers/gpu/drm/drm_crtc_helper.c @@ -837,6 +837,7 @@ void drm_helper_mode_fill_fb_struct(struct drm_framebuffer *fb, for (i = 0; i < 4; i++) { fb->pitches[i] = mode_cmd->pitches[i]; fb->offsets[i] = mode_cmd->offsets[i]; + fb->modifier[i] = mode_cmd->modifier[i]; } drm_fb_get_bpp_depth(mode_cmd->pixel_format, &fb->depth, &fb->bits_per_pixel); diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index 3785d66721f2..a6d773a61c2d 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -321,6 +321,9 @@ static int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_ else req->value = 64; break; + case DRM_CAP_ADDFB2_MODIFIERS: + req->value = dev->mode_config.allow_fb_modifiers; + break; default: return -EINVAL; } diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index 0ebd9286b332..cee54c45eba9 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -202,6 +202,7 @@ struct drm_framebuffer { const struct drm_framebuffer_funcs *funcs; unsigned int pitches[4]; unsigned int offsets[4]; + uint64_t modifier[4]; unsigned int width; unsigned int height; /* depth can be 15 or 16 */ @@ -1152,6 +1153,9 @@ struct drm_mode_config { /* whether async page flip is supported or not */ bool async_page_flip; + /* whether the driver supports fb modifiers */ + bool allow_fb_modifiers; + /* cursor size */ uint32_t cursor_width, cursor_height; }; diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h index 01b2d6d0e355..ff6ef62d084b 100644 --- a/include/uapi/drm/drm.h +++ b/include/uapi/drm/drm.h @@ -630,6 +630,7 @@ struct drm_gem_open { */ #define DRM_CAP_CURSOR_WIDTH 0x8 #define DRM_CAP_CURSOR_HEIGHT 0x9 +#define DRM_CAP_ADDFB2_MODIFIERS 0x10 /** DRM_IOCTL_GET_CAP ioctl argument type */ struct drm_get_cap { diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 646ae5f39f42..622109677747 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -132,4 +132,36 @@ #define DRM_FORMAT_YUV444 fourcc_code('Y', 'U', '2', '4') /* non-subsampled Cb (1) and Cr (2) planes */ #define DRM_FORMAT_YVU444 fourcc_code('Y', 'V', '2', '4') /* non-subsampled Cr (1) and Cb (2) planes */ + +/* + * Format Modifiers: + * + * Format modifiers describe, typically, a re-ordering or modification + * of the data in a plane of an FB. This can be used to express tiled/ + * swizzled formats, or compression, or a combination of the two. + * + * The upper 8 bits of the format modifier are a vendor-id as assigned + * below. The lower 56 bits are assigned as vendor sees fit. + */ + +/* Vendor Ids: */ +#define DRM_FORMAT_MOD_NONE 0 +#define DRM_FORMAT_MOD_VENDOR_INTEL 0x01 +#define DRM_FORMAT_MOD_VENDOR_AMD 0x02 +#define DRM_FORMAT_MOD_VENDOR_NV 0x03 +#define DRM_FORMAT_MOD_VENDOR_SAMSUNG 0x04 +#define DRM_FORMAT_MOD_VENDOR_QCOM 0x05 +/* add more to the end as needed */ + +#define fourcc_mod_code(vendor, val) \ + ((((u64)DRM_FORMAT_MOD_VENDOR_## vendor) << 56) | (val & 0x00ffffffffffffffL)) + +/* + * Format Modifier tokens: + * + * When adding a new token please document the layout with a code comment, + * similar to the fourcc codes above. drm_fourcc.h is considered the + * authoritative source for all of these. + */ + #endif /* DRM_FOURCC_H */ diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index ca788e01dab2..dbeba949462a 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -336,6 +336,7 @@ struct drm_mode_fb_cmd { }; #define DRM_MODE_FB_INTERLACED (1<<0) /* for interlaced framebuffers */ +#define DRM_MODE_FB_MODIFIERS (1<<1) /* enables ->modifer[] */ struct drm_mode_fb_cmd2 { __u32 fb_id; @@ -356,10 +357,18 @@ struct drm_mode_fb_cmd2 { * So it would consist of Y as offsets[0] and UV as * offsets[1]. Note that offsets[0] will generally * be 0 (but this is not required). + * + * To accommodate tiled, compressed, etc formats, a per-plane + * modifier can be specified. The default value of zero + * indicates "native" format as specified by the fourcc. + * Vendor specific modifier token. This allows, for example, + * different tiling/swizzling pattern on different planes. + * See discussion above of DRM_FORMAT_MOD_xxx. */ __u32 handles[4]; __u32 pitches[4]; /* pitch for each plane */ __u32 offsets[4]; /* offset of each plane */ + __u64 modifier[4]; /* ie, tiling, compressed (per plane) */ }; #define DRM_MODE_FB_DIRTY_ANNOTATE_COPY 0x01 -- cgit v1.2.3 From 93b81f5102a7cd270a305c2741b17c8d44bb0629 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 10 Feb 2015 17:16:05 +0000 Subject: drm/i915: Add tiled framebuffer modifiers To be used from the new addfb2 extension. v2: - Drop Intel-specific untiled modfier. - Move to drm_fourcc.h. - Document layouts a bit and denote them as platform-specific and not useable for cross-driver sharing. - Add Y-tiling for completeness. - Drop special docstring markers to avoid confusing kerneldoc. v3: Give Y-tiling a unique idea, noticed by Tvrtko. Signed-off-by: Tvrtko Ursulin (v1) Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_drv.h | 1 + include/uapi/drm/drm_fourcc.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 3da3dc527315..99b25928df2f 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -31,6 +31,7 @@ #define _I915_DRV_H_ #include +#include #include "i915_reg.h" #include "intel_bios.h" diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 622109677747..4837c3d2319a 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -164,4 +164,35 @@ * authoritative source for all of these. */ +/* Intel framebuffer modifiers */ + +/* + * Intel X-tiling layout + * + * This is a tiled layout using 4Kb tiles (except on gen2 where the tiles 2Kb) + * in row-major layout. Within the tile bytes are laid out row-major, with + * a platform-dependent stride. On top of that the memory can apply + * platform-depending swizzling of some higher address bits into bit6. + * + * This format is highly platforms specific and not useful for cross-driver + * sharing. It exists since on a given platform it does uniquely identify the + * layout in a simple way for i915-specific userspace. + */ +#define I915_FORMAT_MOD_X_TILED fourcc_mod_code(INTEL, 1) + +/* + * Intel Y-tiling layout + * + * This is a tiled layout using 4Kb tiles (except on gen2 where the tiles 2Kb) + * in row-major layout. Within the tile bytes are laid out in OWORD (16 bytes) + * chunks column-major, with a platform-dependent height. On top of that the + * memory can apply platform-depending swizzling of some higher address bits + * into bit6. + * + * This format is highly platforms specific and not useful for cross-driver + * sharing. It exists since on a given platform it does uniquely identify the + * layout in a simple way for i915-specific userspace. + */ +#define I915_FORMAT_MOD_Y_TILED fourcc_mod_code(INTEL, 2) + #endif /* DRM_FOURCC_H */ -- cgit v1.2.3 From 293487c8ecc1103f4625cea5e90e1ba0cc89660f Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Tue, 10 Feb 2015 18:33:51 +0200 Subject: iio: Export userspace IIO headers After UAPI header file split [1] all user-kernel interfaces were placed under include/uapi/. This patch moves IIO user specific API from: * include/linux/iio/events.h => include/uapi/linux/iio/events.h * include/linux/types.h => include/uapi/linux/types.h Now there is no need for nasty tricks to compile userspace programs (e.g iio_event_monitor). Just installing the kernel headers with make headers_install command does the job. [1] http://lwn.net/Articles/507794/ Signed-off-by: Daniel Baluta Signed-off-by: Jonathan Cameron --- include/linux/iio/events.h | 30 +------------- include/linux/iio/types.h | 78 +--------------------------------- include/uapi/linux/Kbuild | 1 + include/uapi/linux/iio/Kbuild | 3 ++ include/uapi/linux/iio/events.h | 42 +++++++++++++++++++ include/uapi/linux/iio/types.h | 92 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 140 insertions(+), 106 deletions(-) create mode 100644 include/uapi/linux/iio/Kbuild create mode 100644 include/uapi/linux/iio/events.h create mode 100644 include/uapi/linux/iio/types.h (limited to 'include/uapi') diff --git a/include/linux/iio/events.h b/include/linux/iio/events.h index 03fa332ad2a8..8ad87d1c5340 100644 --- a/include/linux/iio/events.h +++ b/include/linux/iio/events.h @@ -9,22 +9,8 @@ #ifndef _IIO_EVENTS_H_ #define _IIO_EVENTS_H_ -#include -#include #include - -/** - * struct iio_event_data - The actual event being pushed to userspace - * @id: event identifier - * @timestamp: best estimate of time of event occurrence (often from - * the interrupt handler) - */ -struct iio_event_data { - __u64 id; - __s64 timestamp; -}; - -#define IIO_GET_EVENT_FD_IOCTL _IOR('i', 0x90, int) +#include /** * IIO_EVENT_CODE() - create event identifier @@ -70,18 +56,4 @@ struct iio_event_data { #define IIO_UNMOD_EVENT_CODE(chan_type, number, type, direction) \ IIO_EVENT_CODE(chan_type, 0, 0, direction, type, number, 0, 0) -#define IIO_EVENT_CODE_EXTRACT_TYPE(mask) ((mask >> 56) & 0xFF) - -#define IIO_EVENT_CODE_EXTRACT_DIR(mask) ((mask >> 48) & 0x7F) - -#define IIO_EVENT_CODE_EXTRACT_CHAN_TYPE(mask) ((mask >> 32) & 0xFF) - -/* Event code number extraction depends on which type of event we have. - * Perhaps review this function in the future*/ -#define IIO_EVENT_CODE_EXTRACT_CHAN(mask) ((__s16)(mask & 0xFFFF)) -#define IIO_EVENT_CODE_EXTRACT_CHAN2(mask) ((__s16)(((mask) >> 16) & 0xFFFF)) - -#define IIO_EVENT_CODE_EXTRACT_MODIFIER(mask) ((mask >> 40) & 0xFF) -#define IIO_EVENT_CODE_EXTRACT_DIFF(mask) (((mask) >> 55) & 0x1) - #endif diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 580ed5bdb3fa..942b6de68e2f 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -10,76 +10,7 @@ #ifndef _IIO_TYPES_H_ #define _IIO_TYPES_H_ -enum iio_chan_type { - IIO_VOLTAGE, - IIO_CURRENT, - IIO_POWER, - IIO_ACCEL, - IIO_ANGL_VEL, - IIO_MAGN, - IIO_LIGHT, - IIO_INTENSITY, - IIO_PROXIMITY, - IIO_TEMP, - IIO_INCLI, - IIO_ROT, - IIO_ANGL, - IIO_TIMESTAMP, - IIO_CAPACITANCE, - IIO_ALTVOLTAGE, - IIO_CCT, - IIO_PRESSURE, - IIO_HUMIDITYRELATIVE, - IIO_ACTIVITY, - IIO_STEPS, - IIO_ENERGY, - IIO_DISTANCE, - IIO_VELOCITY, -}; - -enum iio_modifier { - IIO_NO_MOD, - IIO_MOD_X, - IIO_MOD_Y, - IIO_MOD_Z, - IIO_MOD_X_AND_Y, - IIO_MOD_X_AND_Z, - IIO_MOD_Y_AND_Z, - IIO_MOD_X_AND_Y_AND_Z, - IIO_MOD_X_OR_Y, - IIO_MOD_X_OR_Z, - IIO_MOD_Y_OR_Z, - IIO_MOD_X_OR_Y_OR_Z, - IIO_MOD_LIGHT_BOTH, - IIO_MOD_LIGHT_IR, - IIO_MOD_ROOT_SUM_SQUARED_X_Y, - IIO_MOD_SUM_SQUARED_X_Y_Z, - IIO_MOD_LIGHT_CLEAR, - IIO_MOD_LIGHT_RED, - IIO_MOD_LIGHT_GREEN, - IIO_MOD_LIGHT_BLUE, - IIO_MOD_QUATERNION, - IIO_MOD_TEMP_AMBIENT, - IIO_MOD_TEMP_OBJECT, - IIO_MOD_NORTH_MAGN, - IIO_MOD_NORTH_TRUE, - IIO_MOD_NORTH_MAGN_TILT_COMP, - IIO_MOD_NORTH_TRUE_TILT_COMP, - IIO_MOD_RUNNING, - IIO_MOD_JOGGING, - IIO_MOD_WALKING, - IIO_MOD_STILL, - IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z, -}; - -enum iio_event_type { - IIO_EV_TYPE_THRESH, - IIO_EV_TYPE_MAG, - IIO_EV_TYPE_ROC, - IIO_EV_TYPE_THRESH_ADAPTIVE, - IIO_EV_TYPE_MAG_ADAPTIVE, - IIO_EV_TYPE_CHANGE, -}; +#include enum iio_event_info { IIO_EV_INFO_ENABLE, @@ -88,13 +19,6 @@ enum iio_event_info { IIO_EV_INFO_PERIOD, }; -enum iio_event_direction { - IIO_EV_DIR_EITHER, - IIO_EV_DIR_RISING, - IIO_EV_DIR_FALLING, - IIO_EV_DIR_NONE, -}; - #define IIO_VAL_INT 1 #define IIO_VAL_INT_PLUS_MICRO 2 #define IIO_VAL_INT_PLUS_NANO 3 diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 00b100023c47..5bfc5bdc3c5d 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -6,6 +6,7 @@ header-y += caif/ header-y += dvb/ header-y += hdlc/ header-y += hsi/ +header-y += iio/ header-y += isdn/ header-y += mmc/ header-y += nfsd/ diff --git a/include/uapi/linux/iio/Kbuild b/include/uapi/linux/iio/Kbuild new file mode 100644 index 000000000000..86f76d84c44f --- /dev/null +++ b/include/uapi/linux/iio/Kbuild @@ -0,0 +1,3 @@ +# UAPI Header export list +header-y += events.h +header-y += types.h diff --git a/include/uapi/linux/iio/events.h b/include/uapi/linux/iio/events.h new file mode 100644 index 000000000000..00bbdaed2f97 --- /dev/null +++ b/include/uapi/linux/iio/events.h @@ -0,0 +1,42 @@ +/* The industrial I/O - event passing to userspace + * + * Copyright (c) 2008-2011 Jonathan Cameron + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ +#ifndef _UAPI_IIO_EVENTS_H_ +#define _UAPI_IIO_EVENTS_H_ + +#include +#include + +/** + * struct iio_event_data - The actual event being pushed to userspace + * @id: event identifier + * @timestamp: best estimate of time of event occurrence (often from + * the interrupt handler) + */ +struct iio_event_data { + __u64 id; + __s64 timestamp; +}; + +#define IIO_GET_EVENT_FD_IOCTL _IOR('i', 0x90, int) + +#define IIO_EVENT_CODE_EXTRACT_TYPE(mask) ((mask >> 56) & 0xFF) + +#define IIO_EVENT_CODE_EXTRACT_DIR(mask) ((mask >> 48) & 0x7F) + +#define IIO_EVENT_CODE_EXTRACT_CHAN_TYPE(mask) ((mask >> 32) & 0xFF) + +/* Event code number extraction depends on which type of event we have. + * Perhaps review this function in the future*/ +#define IIO_EVENT_CODE_EXTRACT_CHAN(mask) ((__s16)(mask & 0xFFFF)) +#define IIO_EVENT_CODE_EXTRACT_CHAN2(mask) ((__s16)(((mask) >> 16) & 0xFFFF)) + +#define IIO_EVENT_CODE_EXTRACT_MODIFIER(mask) ((mask >> 40) & 0xFF) +#define IIO_EVENT_CODE_EXTRACT_DIFF(mask) (((mask) >> 55) & 0x1) + +#endif /* _UAPI_IIO_EVENTS_H_ */ diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h new file mode 100644 index 000000000000..5c4601935005 --- /dev/null +++ b/include/uapi/linux/iio/types.h @@ -0,0 +1,92 @@ +/* industrial I/O data types needed both in and out of kernel + * + * Copyright (c) 2008 Jonathan Cameron + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#ifndef _UAPI_IIO_TYPES_H_ +#define _UAPI_IIO_TYPES_H_ + +enum iio_chan_type { + IIO_VOLTAGE, + IIO_CURRENT, + IIO_POWER, + IIO_ACCEL, + IIO_ANGL_VEL, + IIO_MAGN, + IIO_LIGHT, + IIO_INTENSITY, + IIO_PROXIMITY, + IIO_TEMP, + IIO_INCLI, + IIO_ROT, + IIO_ANGL, + IIO_TIMESTAMP, + IIO_CAPACITANCE, + IIO_ALTVOLTAGE, + IIO_CCT, + IIO_PRESSURE, + IIO_HUMIDITYRELATIVE, + IIO_ACTIVITY, + IIO_STEPS, + IIO_ENERGY, + IIO_DISTANCE, + IIO_VELOCITY, +}; + +enum iio_modifier { + IIO_NO_MOD, + IIO_MOD_X, + IIO_MOD_Y, + IIO_MOD_Z, + IIO_MOD_X_AND_Y, + IIO_MOD_X_AND_Z, + IIO_MOD_Y_AND_Z, + IIO_MOD_X_AND_Y_AND_Z, + IIO_MOD_X_OR_Y, + IIO_MOD_X_OR_Z, + IIO_MOD_Y_OR_Z, + IIO_MOD_X_OR_Y_OR_Z, + IIO_MOD_LIGHT_BOTH, + IIO_MOD_LIGHT_IR, + IIO_MOD_ROOT_SUM_SQUARED_X_Y, + IIO_MOD_SUM_SQUARED_X_Y_Z, + IIO_MOD_LIGHT_CLEAR, + IIO_MOD_LIGHT_RED, + IIO_MOD_LIGHT_GREEN, + IIO_MOD_LIGHT_BLUE, + IIO_MOD_QUATERNION, + IIO_MOD_TEMP_AMBIENT, + IIO_MOD_TEMP_OBJECT, + IIO_MOD_NORTH_MAGN, + IIO_MOD_NORTH_TRUE, + IIO_MOD_NORTH_MAGN_TILT_COMP, + IIO_MOD_NORTH_TRUE_TILT_COMP, + IIO_MOD_RUNNING, + IIO_MOD_JOGGING, + IIO_MOD_WALKING, + IIO_MOD_STILL, + IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z, +}; + +enum iio_event_type { + IIO_EV_TYPE_THRESH, + IIO_EV_TYPE_MAG, + IIO_EV_TYPE_ROC, + IIO_EV_TYPE_THRESH_ADAPTIVE, + IIO_EV_TYPE_MAG_ADAPTIVE, + IIO_EV_TYPE_CHANGE, +}; + +enum iio_event_direction { + IIO_EV_DIR_EITHER, + IIO_EV_DIR_RISING, + IIO_EV_DIR_FALLING, + IIO_EV_DIR_NONE, +}; + +#endif /* _UAPI_IIO_TYPES_H_ */ + -- cgit v1.2.3 From 76a3aeac2f6c02ecf065fa9baa279dd54bf2d819 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Tue, 17 Feb 2015 00:05:27 +0100 Subject: hdspm.h: include stdint.h in userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes compilation error: sound/hdspm.h:43:2: error: unknown type name ‘uint32_t’ Signed-off-by: Mikko Rapeli Signed-off-by: Takashi Iwai --- include/uapi/sound/hdspm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/sound/hdspm.h b/include/uapi/sound/hdspm.h index b357f1a5e29c..5737332d38f2 100644 --- a/include/uapi/sound/hdspm.h +++ b/include/uapi/sound/hdspm.h @@ -20,6 +20,12 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#ifdef __KERNEL__ +#include +#else +#include +#endif + /* Maximum channels is 64 even on 56Mode you have 64playbacks to matrix */ #define HDSPM_MAX_CHANNELS 64 -- cgit v1.2.3 From 4bebf7091aa15ec60edf0dcbc654410a87ca21fe Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Tue, 17 Feb 2015 00:05:38 +0100 Subject: include/uapi/sound/asound.h: include stdlib.h in userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes compiler errors like: error: field ‘trigger_tstamp’ has incomplete type error: invalid application of ‘sizeof’ to incomplete t ype ‘struct timespec’ Signed-off-by: Mikko Rapeli Signed-off-by: Takashi Iwai --- include/uapi/sound/asound.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 1f23cd635957..1fe3f4f3d696 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -25,6 +25,9 @@ #include +#ifndef __KERNEL__ +#include +#endif /* * protocol version -- cgit v1.2.3 From bbf91c1c5bfc00c2961f15657359ee7e87de4269 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Tue, 17 Feb 2015 00:05:42 +0100 Subject: include/uapi/sound/asequencer.h: include sound/asound.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes userspace compilation error: error: unknown type name ‘snd_seq_client_type_t’ snd_seq_client_type_t type; /* client type */ Signed-off-by: Mikko Rapeli Signed-off-by: Takashi Iwai --- include/uapi/sound/asequencer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/sound/asequencer.h b/include/uapi/sound/asequencer.h index 09c8a00ea503..5a5fa4956ebd 100644 --- a/include/uapi/sound/asequencer.h +++ b/include/uapi/sound/asequencer.h @@ -22,6 +22,7 @@ #ifndef _UAPI__SOUND_ASEQUENCER_H #define _UAPI__SOUND_ASEQUENCER_H +#include /** version of the sequencer */ #define SNDRV_SEQ_VERSION SNDRV_PROTOCOL_VERSION (1, 0, 1) -- cgit v1.2.3 From b9956409c281931c74ba8d0a2b61a98076a58602 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Tue, 17 Feb 2015 00:05:43 +0100 Subject: include/uapi/sound/emu10k1.h: include sound/asound.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes userspace compilation errors like: error: field ‘id’ has incomplete type struct snd_ctl_elem_id id; /* full control ID definition */ Signed-off-by: Mikko Rapeli Signed-off-by: Takashi Iwai --- include/uapi/sound/emu10k1.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/sound/emu10k1.h b/include/uapi/sound/emu10k1.h index d1bbaf78457a..ec1535bb6aed 100644 --- a/include/uapi/sound/emu10k1.h +++ b/include/uapi/sound/emu10k1.h @@ -23,8 +23,7 @@ #define _UAPI__SOUND_EMU10K1_H #include - - +#include /* * ---- FX8010 ---- -- cgit v1.2.3 From 27ac905b8f88d28779b0661809286b5ba2817d37 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:55:57 -0500 Subject: perf/x86/intel: Reduce lbr_sel_map[] size The index of lbr_sel_map is bit value of perf branch_sample_type. PERF_SAMPLE_BRANCH_MAX is 1024 at present, so each lbr_sel_map uses 4096 bytes. By using bit shift as index, we can reduce lbr_sel_map size to 40 bytes. This patch defines 'bit shift' for branch types, and use 'bit shift' to define lbr_sel_maps. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: jolsa@redhat.com Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1415156173-10035-2-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 4 +++ arch/x86/kernel/cpu/perf_event_intel_lbr.c | 54 ++++++++++++++---------------- include/uapi/linux/perf_event.h | 49 +++++++++++++++++++-------- 3 files changed, 64 insertions(+), 43 deletions(-) (limited to 'include/uapi') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index df525d2be1e8..0c45b22495dc 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -515,6 +515,10 @@ struct x86_pmu { struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); }; +enum { + PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT, +}; + #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 58f1a94beaf0..8bc078f43a82 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -69,10 +69,6 @@ static enum { #define LBR_FROM_FLAG_IN_TX (1ULL << 62) #define LBR_FROM_FLAG_ABORT (1ULL << 61) -#define for_each_branch_sample_type(x) \ - for ((x) = PERF_SAMPLE_BRANCH_USER; \ - (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) - /* * x86control flow change classification * x86control flow changes include branches, interrupts, traps, faults @@ -403,14 +399,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) { struct hw_perf_event_extra *reg; u64 br_type = event->attr.branch_sample_type; - u64 mask = 0, m; - u64 v; + u64 mask = 0, v; + int i; - for_each_branch_sample_type(m) { - if (!(br_type & m)) + for (i = 0; i < PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE; i++) { + if (!(br_type & (1ULL << i))) continue; - v = x86_pmu.lbr_sel_map[m]; + v = x86_pmu.lbr_sel_map[i]; if (v == LBR_NOT_SUPP) return -EOPNOTSUPP; @@ -678,35 +674,35 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) /* * Map interface branch filters onto LBR filters */ -static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP - | LBR_IND_JMP | LBR_FAR, +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP + | LBR_IND_JMP | LBR_FAR, /* * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches */ - [PERF_SAMPLE_BRANCH_ANY_CALL] = + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, /* * NHM/WSM erratum: must include IND_JMP to capture IND_CALL */ - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, }; -static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, - [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL - | LBR_FAR, - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, }; /* core */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 9b79abbd1ab8..e46b93279e3d 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -152,21 +152,42 @@ enum perf_event_sample_format { * The branch types can be combined, however BRANCH_ANY covers all types * of branches and therefore it supersedes all the other types. */ +enum perf_branch_sample_type_shift { + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ + PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ + + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ +}; + enum perf_branch_sample_type { - PERF_SAMPLE_BRANCH_USER = 1U << 0, /* user branches */ - PERF_SAMPLE_BRANCH_KERNEL = 1U << 1, /* kernel branches */ - PERF_SAMPLE_BRANCH_HV = 1U << 2, /* hypervisor branches */ - - PERF_SAMPLE_BRANCH_ANY = 1U << 3, /* any branch types */ - PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */ - PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */ - PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */ - PERF_SAMPLE_BRANCH_ABORT_TX = 1U << 7, /* transaction aborts */ - PERF_SAMPLE_BRANCH_IN_TX = 1U << 8, /* in transaction */ - PERF_SAMPLE_BRANCH_NO_TX = 1U << 9, /* not in transaction */ - PERF_SAMPLE_BRANCH_COND = 1U << 10, /* conditional branches */ - - PERF_SAMPLE_BRANCH_MAX = 1U << 11, /* non-ABI */ + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, + + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, + PERF_SAMPLE_BRANCH_ANY_CALL = + 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ANY_RETURN = + 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, + PERF_SAMPLE_BRANCH_IND_CALL = + 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ABORT_TX = + 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, + PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, + + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; #define PERF_SAMPLE_BRANCH_PLM_ALL \ -- cgit v1.2.3 From 2c44b1936bb3b135a3fac8b3493394d42e51cf70 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Nov 2014 10:36:45 +0100 Subject: perf/x86/intel: Expose LBR callstack to user space tooling With LBR call stack feature enable, there are three callchain options. Enable the 3rd callchain option (LBR callstack) to user space tooling. Signed-off-by: Peter Zijlstra (Intel) Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20141105093759.GQ10501@worktop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 8 -------- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 8 ++++---- include/uapi/linux/perf_event.h | 16 ++++++++-------- 3 files changed, 12 insertions(+), 20 deletions(-) (limited to 'include/uapi') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 69c26b396cf4..a371d27d6795 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -523,14 +523,6 @@ struct x86_perf_task_context { int lbr_stack_state; }; -enum { - PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT, - PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE, - - PERF_SAMPLE_BRANCH_CALL_STACK = - 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, -}; - #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 084f2eb20c8b..0473874109cb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -537,7 +537,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) u64 mask = 0, v; int i; - for (i = 0; i < PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE; i++) { + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { if (!(br_type & (1ULL << i))) continue; @@ -821,7 +821,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) /* * Map interface branch filters onto LBR filters */ -static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, @@ -840,7 +840,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, }; -static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, @@ -852,7 +852,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, }; -static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = { +static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e46b93279e3d..1e3cd07cf76e 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -166,6 +166,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -175,18 +177,16 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, - PERF_SAMPLE_BRANCH_ANY_CALL = - 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, - PERF_SAMPLE_BRANCH_ANY_RETURN = - 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, - PERF_SAMPLE_BRANCH_IND_CALL = - 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, - PERF_SAMPLE_BRANCH_ABORT_TX = - 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; -- cgit v1.2.3 From 229d043096ea8e58829d37d35767afeac15997f5 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 13 Feb 2015 15:14:03 -0600 Subject: ALSA: core: selection of audio_tstamp type and accuracy reports Audio timestamps can be extracted from sample counters, wall clocks, PHC clocks (Ethernet AVB), on-demand synchronized snapshots. This patch provides the ability to report timestamping capabilities, select timestamp types and retrieve timestamp accuracy, if supported. Details can be found in Documentations/sound/alsa/timestamping.txt This functionality is introduced by reclaiming the reserved_aligned field introduced by commit9c7066aef4a5eb8e4063de28f06c508bf6f2963a in snd_pcm_status to provide userspace with selection/query capabilities. Additional driver_tstamp and audio_tstamp_accuracy fields are also added. snd_pcm_mmap_status remains a read-only structure with only the audio timestamp value accessible from user space. The selection of audio timestamp type is done through snd_pcm_status only This commit does not impact ABI and does not impact the default behavior. By default audio timestamp is aligned with hw_pointer and reports the DMA position. Backwards compatibility is handled by using the HDAudio wall clock for playback and the hw_ptr for all other cases. For timestamp selection a new STATUS_EXT ioctl is introduced with read/write parameters. Alsa-lib will be modified to make use of STATUS_EXT. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Takashi Iwai --- Documentation/sound/alsa/timestamping.txt | 200 ++++++++++++++++++++++++++++++ include/sound/pcm.h | 60 +++++++++ include/uapi/sound/asound.h | 34 ++++- 3 files changed, 290 insertions(+), 4 deletions(-) create mode 100644 Documentation/sound/alsa/timestamping.txt (limited to 'include/uapi') diff --git a/Documentation/sound/alsa/timestamping.txt b/Documentation/sound/alsa/timestamping.txt new file mode 100644 index 000000000000..0b191a23f534 --- /dev/null +++ b/Documentation/sound/alsa/timestamping.txt @@ -0,0 +1,200 @@ +The ALSA API can provide two different system timestamps: + +- Trigger_tstamp is the system time snapshot taken when the .trigger +callback is invoked. This snapshot is taken by the ALSA core in the +general case, but specific hardware may have synchronization +capabilities or conversely may only be able to provide a correct +estimate with a delay. In the latter two cases, the low-level driver +is responsible for updating the trigger_tstamp at the most appropriate +and precise moment. Applications should not rely solely on the first +trigger_tstamp but update their internal calculations if the driver +provides a refined estimate with a delay. + +- tstamp is the current system timestamp updated during the last +event or application query. +The difference (tstamp - trigger_tstamp) defines the elapsed time. + +The ALSA API provides reports two basic pieces of information, avail +and delay, which combined with the trigger and current system +timestamps allow for applications to keep track of the 'fullness' of +the ring buffer and the amount of queued samples. + +The use of these different pointers and time information depends on +the application needs: + +- 'avail' reports how much can be written in the ring buffer +- 'delay' reports the time it will take to hear a new sample after all +queued samples have been played out. + +When timestamps are enabled, the avail/delay information is reported +along with a snapshot of system time. Applications can select from +CLOCK_REALTIME (NTP corrections including going backwards), +CLOCK_MONOTONIC (NTP corrections but never going backwards), +CLOCK_MONOTIC_RAW (without NTP corrections) and change the mode +dynamically with sw_params + + +The ALSA API also provide an audio_tstamp which reflects the passage +of time as measured by different components of audio hardware. In +ascii-art, this could be represented as follows (for the playback +case): + + +--------------------------------------------------------------> time + ^ ^ ^ ^ ^ + | | | | | + analog link dma app FullBuffer + time time time time time + | | | | | + |< codec delay >|<--hw delay-->||<---avail->| + |<----------------- delay---------------------->| | + |<----ring buffer length---->| + +The analog time is taken at the last stage of the playback, as close +as possible to the actual transducer + +The link time is taken at the output of the SOC/chipset as the samples +are pushed on a link. The link time can be directly measured if +supported in hardware by sample counters or wallclocks (e.g. with +HDAudio 24MHz or PTP clock for networked solutions) or indirectly +estimated (e.g. with the frame counter in USB). + +The DMA time is measured using counters - typically the least reliable +of all measurements due to the bursty natured of DMA transfers. + +The app time corresponds to the time tracked by an application after +writing in the ring buffer. + +The application can query what the hardware supports, define which +audio time it wants reported by selecting the relevant settings in +audio_tstamp_config fields, get an estimate of the timestamp +accuracy. It can also request the delay-to-analog be included in the +measurement. Direct access to the link time is very interesting on +platforms that provide an embedded DSP; measuring directly the link +time with dedicated hardware, possibly synchronized with system time, +removes the need to keep track of internal DSP processing times and +latency. + +In case the application requests an audio tstamp that is not supported +in hardware/low-level driver, the type is overridden as DEFAULT and the +timestamp will report the DMA time based on the hw_pointer value. + +For backwards compatibility with previous implementations that did not +provide timestamp selection, with a zero-valued COMPAT timestamp type +the results will default to the HDAudio wall clock for playback +streams and to the DMA time (hw_ptr) in all other cases. + +The audio timestamp accuracy can be returned to user-space, so that +appropriate decisions are made: + +- for dma time (default), the granularity of the transfers can be + inferred from the steps between updates and in turn provide + information on how much the application pointer can be rewound + safely. + +- the link time can be used to track long-term drifts between audio + and system time using the (tstamp-trigger_tstamp)/audio_tstamp + ratio, the precision helps define how much smoothing/low-pass + filtering is required. The link time can be either reset on startup + or reported as is (the latter being useful to compare progress of + different streams - but may require the wallclock to be always + running and not wrap-around during idle periods). If supported in + hardware, the absolute link time could also be used to define a + precise start time (patches WIP) + +- including the delay in the audio timestamp may + counter-intuitively not increase the precision of timestamps, e.g. if a + codec includes variable-latency DSP processing or a chain of + hardware components the delay is typically not known with precision. + +The accuracy is reported in nanosecond units (using an unsigned 32-bit +word), which gives a max precision of 4.29s, more than enough for +audio applications... + +Due to the varied nature of timestamping needs, even for a single +application, the audio_tstamp_config can be changed dynamically. In +the STATUS ioctl, the parameters are read-only and do not allow for +any application selection. To work around this limitation without +impacting legacy applications, a new STATUS_EXT ioctl is introduced +with read/write parameters. ALSA-lib will be modified to make use of +STATUS_EXT and effectively deprecate STATUS. + +The ALSA API only allows for a single audio timestamp to be reported +at a time. This is a conscious design decision, reading the audio +timestamps from hardware registers or from IPC takes time, the more +timestamps are read the more imprecise the combined measurements +are. To avoid any interpretation issues, a single (system, audio) +timestamp is reported. Applications that need different timestamps +will be required to issue multiple queries and perform an +interpolation of the results + +In some hardware-specific configuration, the system timestamp is +latched by a low-level audio subsytem, and the information provided +back to the driver. Due to potential delays in the communication with +the hardware, there is a risk of misalignment with the avail and delay +information. To make sure applications are not confused, a +driver_timestamp field is added in the snd_pcm_status structure; this +timestamp shows when the information is put together by the driver +before returning from the STATUS and STATUS_EXT ioctl. in most cases +this driver_timestamp will be identical to the regular system tstamp. + +Examples of typestamping with HDaudio: + +1. DMA timestamp, no compensation for DMA+analog delay +$ ./audio_time -p --ts_type=1 +playback: systime: 341121338 nsec, audio time 342000000 nsec, systime delta -878662 +playback: systime: 426236663 nsec, audio time 427187500 nsec, systime delta -950837 +playback: systime: 597080580 nsec, audio time 598000000 nsec, systime delta -919420 +playback: systime: 682059782 nsec, audio time 683020833 nsec, systime delta -961051 +playback: systime: 852896415 nsec, audio time 853854166 nsec, systime delta -957751 +playback: systime: 937903344 nsec, audio time 938854166 nsec, systime delta -950822 + +2. DMA timestamp, compensation for DMA+analog delay +$ ./audio_time -p --ts_type=1 -d +playback: systime: 341053347 nsec, audio time 341062500 nsec, systime delta -9153 +playback: systime: 426072447 nsec, audio time 426062500 nsec, systime delta 9947 +playback: systime: 596899518 nsec, audio time 596895833 nsec, systime delta 3685 +playback: systime: 681915317 nsec, audio time 681916666 nsec, systime delta -1349 +playback: systime: 852741306 nsec, audio time 852750000 nsec, systime delta -8694 + +3. link timestamp, compensation for DMA+analog delay +$ ./audio_time -p --ts_type=2 -d +playback: systime: 341060004 nsec, audio time 341062791 nsec, systime delta -2787 +playback: systime: 426242074 nsec, audio time 426244875 nsec, systime delta -2801 +playback: systime: 597080992 nsec, audio time 597084583 nsec, systime delta -3591 +playback: systime: 682084512 nsec, audio time 682088291 nsec, systime delta -3779 +playback: systime: 852936229 nsec, audio time 852940916 nsec, systime delta -4687 +playback: systime: 938107562 nsec, audio time 938112708 nsec, systime delta -5146 + +Example 1 shows that the timestamp at the DMA level is close to 1ms +ahead of the actual playback time (as a side time this sort of +measurement can help define rewind safeguards). Compensating for the +DMA-link delay in example 2 helps remove the hardware buffering abut +the information is still very jittery, with up to one sample of +error. In example 3 where the timestamps are measured with the link +wallclock, the timestamps show a monotonic behavior and a lower +dispersion. + +Example 3 and 4 are with USB audio class. Example 3 shows a high +offset between audio time and system time due to buffering. Example 4 +shows how compensating for the delay exposes a 1ms accuracy (due to +the use of the frame counter by the driver) + +Example 3: DMA timestamp, no compensation for delay, delta of ~5ms +$ ./audio_time -p -Dhw:1 -t1 +playback: systime: 120174019 nsec, audio time 125000000 nsec, systime delta -4825981 +playback: systime: 245041136 nsec, audio time 250000000 nsec, systime delta -4958864 +playback: systime: 370106088 nsec, audio time 375000000 nsec, systime delta -4893912 +playback: systime: 495040065 nsec, audio time 500000000 nsec, systime delta -4959935 +playback: systime: 620038179 nsec, audio time 625000000 nsec, systime delta -4961821 +playback: systime: 745087741 nsec, audio time 750000000 nsec, systime delta -4912259 +playback: systime: 870037336 nsec, audio time 875000000 nsec, systime delta -4962664 + +Example 4: DMA timestamp, compensation for delay, delay of ~1ms +$ ./audio_time -p -Dhw:1 -t1 -d +playback: systime: 120190520 nsec, audio time 120000000 nsec, systime delta 190520 +playback: systime: 245036740 nsec, audio time 244000000 nsec, systime delta 1036740 +playback: systime: 370034081 nsec, audio time 369000000 nsec, systime delta 1034081 +playback: systime: 495159907 nsec, audio time 494000000 nsec, systime delta 1159907 +playback: systime: 620098824 nsec, audio time 619000000 nsec, systime delta 1098824 +playback: systime: 745031847 nsec, audio time 744000000 nsec, systime delta 1031847 diff --git a/include/sound/pcm.h b/include/sound/pcm.h index c0ddb7e69c28..60f0e48f7905 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -60,6 +60,9 @@ struct snd_pcm_hardware { struct snd_pcm_substream; +struct snd_pcm_audio_tstamp_config; /* definitions further down */ +struct snd_pcm_audio_tstamp_report; + struct snd_pcm_ops { int (*open)(struct snd_pcm_substream *substream); int (*close)(struct snd_pcm_substream *substream); @@ -281,6 +284,58 @@ struct snd_pcm_hw_constraint_ranges { struct snd_pcm_hwptr_log; +/* + * userspace-provided audio timestamp config to kernel, + * structure is for internal use only and filled with dedicated unpack routine + */ +struct snd_pcm_audio_tstamp_config { + /* 5 of max 16 bits used */ + u32 type_requested:4; + u32 report_delay:1; /* add total delay to A/D or D/A */ +}; + +static inline void snd_pcm_unpack_audio_tstamp_config(__u32 data, + struct snd_pcm_audio_tstamp_config *config) +{ + config->type_requested = data & 0xF; + config->report_delay = (data >> 4) & 1; +} + +/* + * kernel-provided audio timestamp report to user-space + * structure is for internal use only and read by dedicated pack routine + */ +struct snd_pcm_audio_tstamp_report { + /* 6 of max 16 bits used for bit-fields */ + + /* for backwards compatibility */ + u32 valid:1; + + /* actual type if hardware could not support requested timestamp */ + u32 actual_type:4; + + /* accuracy represented in ns units */ + u32 accuracy_report:1; /* 0 if accuracy unknown, 1 if accuracy field is valid */ + u32 accuracy; /* up to 4.29s, will be packed in separate field */ +}; + +static inline void snd_pcm_pack_audio_tstamp_report(__u32 *data, __u32 *accuracy, + const struct snd_pcm_audio_tstamp_report *report) +{ + u32 tmp; + + tmp = report->accuracy_report; + tmp <<= 4; + tmp |= report->actual_type; + tmp <<= 1; + tmp |= report->valid; + + *data &= 0xffff; /* zero-clear MSBs */ + *data |= (tmp << 16); + *accuracy = report->accuracy; +} + + struct snd_pcm_runtime { /* -- Status -- */ struct snd_pcm_substream *trigger_master; @@ -361,6 +416,11 @@ struct snd_pcm_runtime { struct snd_dma_buffer *dma_buffer_p; /* allocated buffer */ + /* -- audio timestamp config -- */ + struct snd_pcm_audio_tstamp_config audio_tstamp_config; + struct snd_pcm_audio_tstamp_report audio_tstamp_report; + struct timespec driver_tstamp; + #if defined(CONFIG_SND_PCM_OSS) || defined(CONFIG_SND_PCM_OSS_MODULE) /* -- OSS things -- */ struct snd_pcm_oss_runtime oss; diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 0e88e7a0f0eb..acef4e4d2735 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -267,10 +267,17 @@ typedef int __bitwise snd_pcm_subformat_t; #define SNDRV_PCM_INFO_JOINT_DUPLEX 0x00200000 /* playback and capture stream are somewhat correlated */ #define SNDRV_PCM_INFO_SYNC_START 0x00400000 /* pcm support some kind of sync go */ #define SNDRV_PCM_INFO_NO_PERIOD_WAKEUP 0x00800000 /* period wakeup can be disabled */ -#define SNDRV_PCM_INFO_HAS_WALL_CLOCK 0x01000000 /* has audio wall clock for audio/system time sync */ +#define SNDRV_PCM_INFO_HAS_WALL_CLOCK 0x01000000 /* (Deprecated)has audio wall clock for audio/system time sync */ +#define SNDRV_PCM_INFO_HAS_LINK_ATIME 0x01000000 /* report hardware link audio time, reset on startup */ +#define SNDRV_PCM_INFO_HAS_LINK_ABSOLUTE_ATIME 0x02000000 /* report absolute hardware link audio time, not reset on startup */ +#define SNDRV_PCM_INFO_HAS_LINK_ESTIMATED_ATIME 0x04000000 /* report estimated link audio time */ +#define SNDRV_PCM_INFO_HAS_LINK_SYNCHRONIZED_ATIME 0x08000000 /* report synchronized audio/system time */ + #define SNDRV_PCM_INFO_DRAIN_TRIGGER 0x40000000 /* internal kernel flag - trigger in drain */ #define SNDRV_PCM_INFO_FIFO_IN_FRAMES 0x80000000 /* internal kernel flag - FIFO size is in frames */ + + typedef int __bitwise snd_pcm_state_t; #define SNDRV_PCM_STATE_OPEN ((__force snd_pcm_state_t) 0) /* stream is open */ #define SNDRV_PCM_STATE_SETUP ((__force snd_pcm_state_t) 1) /* stream has a setup */ @@ -408,6 +415,22 @@ struct snd_pcm_channel_info { unsigned int step; /* samples distance in bits */ }; +enum { + /* + * first definition for backwards compatibility only, + * maps to wallclock/link time for HDAudio playback and DEFAULT/DMA time for everything else + */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_COMPAT = 0, + + /* timestamp definitions */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT = 1, /* DMA time, reported as per hw_ptr */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK = 2, /* link time reported by sample or wallclock counter, reset on startup */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ABSOLUTE = 3, /* link time reported by sample or wallclock counter, not reset on startup */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ESTIMATED = 4, /* link time estimated indirectly */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED = 5, /* link time synchronized with system time */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LAST = SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED +}; + struct snd_pcm_status { snd_pcm_state_t state; /* stream state */ struct timespec trigger_tstamp; /* time when stream was started/stopped/paused */ @@ -419,9 +442,11 @@ struct snd_pcm_status { snd_pcm_uframes_t avail_max; /* max frames available on hw since last status */ snd_pcm_uframes_t overrange; /* count of ADC (capture) overrange detections from last status */ snd_pcm_state_t suspended_state; /* suspended stream state */ - __u32 reserved_alignment; /* must be filled with zero */ - struct timespec audio_tstamp; /* from sample counter or wall clock */ - unsigned char reserved[56-sizeof(struct timespec)]; /* must be filled with zero */ + __u32 audio_tstamp_data; /* needed for 64-bit alignment, used for configs/report to/from userspace */ + struct timespec audio_tstamp; /* sample counter, wall clock, PHC or on-demand sync'ed */ + struct timespec driver_tstamp; /* useful in case reference system tstamp is reported with delay */ + __u32 audio_tstamp_accuracy; /* in ns units, only valid if indicated in audio_tstamp_data */ + unsigned char reserved[52-2*sizeof(struct timespec)]; /* must be filled with zero */ }; struct snd_pcm_mmap_status { @@ -534,6 +559,7 @@ enum { #define SNDRV_PCM_IOCTL_DELAY _IOR('A', 0x21, snd_pcm_sframes_t) #define SNDRV_PCM_IOCTL_HWSYNC _IO('A', 0x22) #define SNDRV_PCM_IOCTL_SYNC_PTR _IOWR('A', 0x23, struct snd_pcm_sync_ptr) +#define SNDRV_PCM_IOCTL_STATUS_EXT _IOWR('A', 0x24, struct snd_pcm_status) #define SNDRV_PCM_IOCTL_CHANNEL_INFO _IOR('A', 0x32, struct snd_pcm_channel_info) #define SNDRV_PCM_IOCTL_PREPARE _IO('A', 0x40) #define SNDRV_PCM_IOCTL_RESET _IO('A', 0x41) -- cgit v1.2.3 From c72638bdaabe9ea4b09003b9db7e1754f472fbed Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 13 Feb 2015 15:14:09 -0600 Subject: ALSA: bump PCM protocol to 2.0.13 Bump PCM protocol to enable use of STATUS_EXT ioctls, older apps will still use STATUS and audio timestamp configuration is not supported (backwards compatible behavior). Signed-off-by: Pierre-Louis Bossart Signed-off-by: Takashi Iwai --- include/uapi/sound/asound.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index acef4e4d2735..3d46e9a0cd2e 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -140,7 +140,7 @@ struct snd_hwdep_dsp_image { * * *****************************************************************************/ -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 12) +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 13) typedef unsigned long snd_pcm_uframes_t; typedef signed long snd_pcm_sframes_t; -- cgit v1.2.3 From 30ff54765976e132674e3eae2071ed8ed494665c Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 23 Feb 2015 08:17:12 -0500 Subject: net: sched: export tc_connmark.h so it is uapi accessible Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild index 19d5219b0b99..242cf0c6e33d 100644 --- a/include/uapi/linux/tc_act/Kbuild +++ b/include/uapi/linux/tc_act/Kbuild @@ -9,3 +9,4 @@ header-y += tc_pedit.h header-y += tc_skbedit.h header-y += tc_vlan.h header-y += tc_bpf.h +header-y += tc_connmark.h -- cgit v1.2.3 From b5ff6e1637b683d5996ae11ac29afe406c0bee90 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 27 Feb 2015 11:15:17 +0000 Subject: drm/i915/skl: Add new displayable tiling formats Starting with SKL display engine can scan out Y, and newly introduced Yf tiling formats so add the latter to the frame buffer modifier space. v2: Definitions moved to drm_fourcc.h. v3: Try to document the format better. Signed-off-by: Tvrtko Ursulin Reviewed-by: Damien Lespiau Signed-off-by: Daniel Vetter --- include/uapi/drm/drm_fourcc.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 4837c3d2319a..b35418b20dcf 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -195,4 +195,19 @@ */ #define I915_FORMAT_MOD_Y_TILED fourcc_mod_code(INTEL, 2) +/* + * Intel Yf-tiling layout + * + * This is a tiled layout using 4Kb tiles in row-major layout. + * Within the tile pixels are laid out in 16 256 byte units / sub-tiles which + * are arranged in four groups (two wide, two high) with column-major layout. + * Each group therefore consits out of four 256 byte units, which are also laid + * out as 2x2 column-major. + * 256 byte units are made out of four 64 byte blocks of pixels, producing + * either a square block or a 2:1 unit. + * 64 byte blocks of pixels contain four pixel rows of 16 bytes, where the width + * in pixel depends on the pixel depth. + */ +#define I915_FORMAT_MOD_Yf_TILED fourcc_mod_code(INTEL, 3) + #endif /* DRM_FOURCC_H */ -- cgit v1.2.3 From 93a714d6b53d87872e552dbb273544bdeaaf6e12 Mon Sep 17 00:00:00 2001 From: Madhu Challa Date: Wed, 25 Feb 2015 09:58:35 -0800 Subject: multicast: Extend ip address command to enable multicast group join/leave on Joining multicast group on ethernet level via "ip maddr" command would not work if we have an Ethernet switch that does igmp snooping since the switch would not replicate multicast packets on ports that did not have IGMP reports for the multicast addresses. Linux vxlan interfaces created via "ip link add vxlan" have the group option that enables then to do the required join. By extending ip address command with option "autojoin" we can get similar functionality for openvswitch vxlan interfaces as well as other tunneling mechanisms that need to receive multicast traffic. The kernel code is structured similar to how the vxlan driver does a group join / leave. example: ip address add 224.1.1.10/24 dev eth5 autojoin ip address del 224.1.1.10/24 dev eth5 Signed-off-by: Madhu Challa Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/netns/ipv6.h | 1 + include/uapi/linux/if_addr.h | 1 + net/ipv4/devinet.c | 31 +++++++++++++++++++++++++++++++ net/ipv4/igmp.c | 13 +++++++++++++ net/ipv6/addrconf.c | 38 +++++++++++++++++++++++++++++++++++--- net/ipv6/mcast.c | 20 ++++++++++++++++---- 7 files changed, 98 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index dbe225478adb..1b26c6c3fd7c 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,6 +49,7 @@ struct netns_ipv4 { struct sock *fibnl; struct sock * __percpu *icmp_sk; + struct sock *mc_autojoin_sk; struct inet_peer_base *peers; struct tcpm_hash_bucket *tcp_metrics_hash; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 69ae41f2098c..ca0db12cd089 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -67,6 +67,7 @@ struct netns_ipv6 { struct sock *ndisc_sk; struct sock *tcp_sk; struct sock *igmp_sk; + struct sock *mc_autojoin_sk; #ifdef CONFIG_IPV6_MROUTE #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES struct mr6_table *mrt6; diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index dea10a87dfd1..40fdfea39714 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -50,6 +50,7 @@ enum { #define IFA_F_PERMANENT 0x80 #define IFA_F_MANAGETEMPADDR 0x100 #define IFA_F_NOPREFIXROUTE 0x200 +#define IFA_F_MCAUTOJOIN 0x400 struct ifa_cacheinfo { __u32 ifa_prefered; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3a8985c94581..5105759e4e00 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -548,6 +548,26 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, return NULL; } +static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa) +{ + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = ifa->ifa_address, + .imr_ifindex = ifa->ifa_dev->dev->ifindex, + }; + int ret; + + ASSERT_RTNL(); + + lock_sock(sk); + if (join) + ret = __ip_mc_join_group(sk, &mreq); + else + ret = __ip_mc_leave_group(sk, &mreq); + release_sock(sk); + + return ret; +} + static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -584,6 +604,8 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa))) continue; + if (ipv4_is_multicast(ifa->ifa_address)) + ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); return 0; } @@ -838,6 +860,15 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); + if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { + int ret = ip_mc_config(net->ipv4.mc_autojoin_sk, + true, ifa); + + if (ret < 0) { + inet_free_ifa(ifa); + return ret; + } + } return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); } else { inet_free_ifa(ifa); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4b1172d73e03..5cb1ef4ce292 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #ifdef CONFIG_IP_MROUTE #include @@ -2740,6 +2741,7 @@ static const struct file_operations igmp_mcf_seq_fops = { static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; + int err; pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops); if (!pde) @@ -2748,8 +2750,18 @@ static int __net_init igmp_net_init(struct net *net) &igmp_mcf_seq_fops); if (!pde) goto out_mcfilter; + err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET, + SOCK_DGRAM, 0, net); + if (err < 0) { + pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n", + err); + goto out_sock; + } + return 0; +out_sock: + remove_proc_entry("mcfilter", net->proc_net); out_mcfilter: remove_proc_entry("igmp", net->proc_net); out_igmp: @@ -2760,6 +2772,7 @@ static void __net_exit igmp_net_exit(struct net *net) { remove_proc_entry("mcfilter", net->proc_net); remove_proc_entry("igmp", net->proc_net); + inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk); } static struct pernet_operations igmp_net_ops = { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 98e4a63d72bb..783bccfcc060 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2464,6 +2464,23 @@ err_exit: return err; } +static int ipv6_mc_config(struct sock *sk, bool join, + const struct in6_addr *addr, int ifindex) +{ + int ret; + + ASSERT_RTNL(); + + lock_sock(sk); + if (join) + ret = __ipv6_sock_mc_join(sk, ifindex, addr); + else + ret = __ipv6_sock_mc_drop(sk, ifindex, addr); + release_sock(sk); + + return ret; +} + /* * Manual configuration of address on an interface */ @@ -2476,10 +2493,10 @@ static int inet6_addr_add(struct net *net, int ifindex, struct inet6_ifaddr *ifp; struct inet6_dev *idev; struct net_device *dev; + unsigned long timeout; + clock_t expires; int scope; u32 flags; - clock_t expires; - unsigned long timeout; ASSERT_RTNL(); @@ -2501,6 +2518,14 @@ static int inet6_addr_add(struct net *net, int ifindex, if (IS_ERR(idev)) return PTR_ERR(idev); + if (ifa_flags & IFA_F_MCAUTOJOIN) { + int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, + true, pfx, ifindex); + + if (ret < 0) + return ret; + } + scope = ipv6_addr_scope(pfx); timeout = addrconf_timeout_fixup(valid_lft, HZ); @@ -2542,6 +2567,9 @@ static int inet6_addr_add(struct net *net, int ifindex, in6_ifa_put(ifp); addrconf_verify_rtnl(); return 0; + } else if (ifa_flags & IFA_F_MCAUTOJOIN) { + ipv6_mc_config(net->ipv6.mc_autojoin_sk, + false, pfx, ifindex); } return PTR_ERR(ifp); @@ -2578,6 +2606,10 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, jiffies); ipv6_del_addr(ifp); addrconf_verify_rtnl(); + if (ipv6_addr_is_multicast(pfx)) { + ipv6_mc_config(net->ipv6.mc_autojoin_sk, + false, pfx, dev->ifindex); + } return 0; } } @@ -3945,7 +3977,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) /* We ignore other flags so far. */ ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | - IFA_F_NOPREFIXROUTE; + IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN; ifa = ipv6_get_ifaddr(net, pfx, dev, 1); if (ifa == NULL) { diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index e4955d019734..1dd1fedff9f4 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2929,20 +2929,32 @@ static int __net_init igmp6_net_init(struct net *net) inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1; + err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6, + SOCK_RAW, IPPROTO_ICMPV6, net); + if (err < 0) { + pr_err("Failed to initialize the IGMP6 autojoin socket (err %d)\n", + err); + goto out_sock_create; + } + err = igmp6_proc_init(net); if (err) - goto out_sock_create; -out: - return err; + goto out_sock_create_autojoin; + + return 0; +out_sock_create_autojoin: + inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); out_sock_create: inet_ctl_sock_destroy(net->ipv6.igmp_sk); - goto out; +out: + return err; } static void __net_exit igmp6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.igmp_sk); + inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); igmp6_proc_exit(net); } -- cgit v1.2.3 From 31f909a2c0abfc1a1a76b2981d28ac85d33210e7 Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Tue, 24 Feb 2015 22:42:16 +0900 Subject: nl/mac80211: allow zero plink timeout to disable STA expiration Both wpa_supplicant and mac80211 have and inactivity timer. By default wpa_supplicant will be timed out in 5 minutes and mac80211's it is 30 minutes. If wpa_supplicant uses a longer timer than mac80211, it will get unexpected disconnection by mac80211. Using 0xffffffff instead as the configured value could solve this w/o changing the code, but due to integer overflow in the expression used this doesn't work. The expression is: (current jiffies) > (frame Rx jiffies + NL80211_MESHCONF_PLINK_TIMEOUT * 250) On 32bit system, the right side would overflow and be a very small value if NL80211_MESHCONF_PLINK_TIMEOUT is sufficiently large, causing unexpectedly early disconnections. Instead allow disabling the inactivity timer to avoid this situation, by passing the (previously invalid and useless) value 0. Signed-off-by: Masashi Honma [reword/rewrap commit log] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 ++- net/mac80211/mesh.c | 3 ++- net/wireless/nl80211.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 68b294e83944..2dcf9bba317c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3092,7 +3092,8 @@ enum nl80211_mesh_power_mode { * * @NL80211_MESHCONF_PLINK_TIMEOUT: If no tx activity is seen from a STA we've * established peering with for longer than this time (in seconds), then - * remove it from the STA's list of peers. Default is 30 minutes. + * remove it from the STA's list of peers. You may set this to 0 to disable + * the removal of the STA. Default is 30 minutes. * * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use */ diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 0c8b2a77d312..acf441ff9f4a 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -574,7 +574,8 @@ static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata) struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u32 changed; - ieee80211_sta_expire(sdata, ifmsh->mshcfg.plink_timeout * HZ); + if (ifmsh->mshcfg.plink_timeout > 0) + ieee80211_sta_expire(sdata, ifmsh->mshcfg.plink_timeout * HZ); mesh_path_expire(sdata); changed = mesh_accept_plinks_update(sdata); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d78fd8b54515..9c6e23ede5b2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5265,7 +5265,7 @@ do { \ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, 0, 65535, mask, NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 1, 0xffffffff, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 0, 0xffffffff, mask, NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32); if (mask_out) -- cgit v1.2.3 From f1a66f85b74c5ef7b503f746ea97742dacd56419 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:43 +0100 Subject: ebpf: export BPF_PSEUDO_MAP_FD to uapi We need to export BPF_PSEUDO_MAP_FD to user space, as it's used in the ELF BPF loader where instructions are being loaded that need map fixups. An initial stage loads all maps into the kernel, and later on replaces related instructions in the eBPF blob with BPF_PSEUDO_MAP_FD as source register and the actual fd as immediate value. The kernel verifier recognizes this keyword and replaces the map fd with a real pointer internally. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 2 -- include/uapi/linux/bpf.h | 2 ++ samples/bpf/libbpf.h | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/filter.h b/include/linux/filter.h index caac2087a4d5..5e3863d5f666 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -145,8 +145,6 @@ struct bpf_prog_aux; .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) -#define BPF_PSEUDO_MAP_FD 1 - /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ #define BPF_LD_MAP_FD(DST, MAP_FD) \ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 45da7ec7d274..0248180bf2e2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -120,6 +120,8 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCKET_FILTER, }; +#define BPF_PSEUDO_MAP_FD 1 + /* flags for BPF_MAP_UPDATE_ELEM command */ #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h index 58c5fe1bdba1..a6bb7e9c22c3 100644 --- a/samples/bpf/libbpf.h +++ b/samples/bpf/libbpf.h @@ -92,7 +92,9 @@ extern char bpf_log_buf[LOG_BUF_SIZE]; .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) -#define BPF_PSEUDO_MAP_FD 1 +#ifndef BPF_PSEUDO_MAP_FD +# define BPF_PSEUDO_MAP_FD 1 +#endif /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ #define BPF_LD_MAP_FD(DST, MAP_FD) \ -- cgit v1.2.3 From 96be4325f443dbbfeb37d2a157675ac0736531a1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:46 +0100 Subject: ebpf: add sched_cls_type and map it to sk_filter's verifier ops As discussed recently and at netconf/netdev01, we want to prevent making bpf_verifier_ops registration available for modules, but have them at a controlled place inside the kernel instead. The reason for this is, that out-of-tree modules can go crazy and define and register any verfifier ops they want, doing all sorts of crap, even bypassing available GPLed eBPF helper functions. We don't want to offer such a shiny playground, of course, but keep strict control to ourselves inside the core kernel. This also encourages us to design eBPF user helpers carefully and generically, so they can be shared among various subsystems using eBPF. For the eBPF traffic classifier (cls_bpf), it's a good start to share the same helper facilities as we currently do in eBPF for socket filters. That way, we have BPF_PROG_TYPE_SCHED_CLS look like it's own type, thus one day if there's a good reason to diverge the set of helper functions from the set available to socket filters, we keep ABI compatibility. In future, we could place all bpf_prog_type_list at a central place, perhaps. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 15 +++++++++++++-- net/core/filter.c | 7 +++++++ 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0248180bf2e2..3fa1af8a58d7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_SCHED_CLS, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a28e09c7825d..594d341f04db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1172,6 +1172,17 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) return 0; } +static bool may_access_skb(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + return true; + default: + return false; + } +} + /* verify safety of LD_ABS|LD_IND instructions: * - they can only appear in the programs where ctx == skb * - since they are wrappers of function calls, they scratch R1-R5 registers, @@ -1194,8 +1205,8 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) struct reg_state *reg; int i, err; - if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { - verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); + if (!may_access_skb(env->prog->aux->prog_type)) { + verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); return -EINVAL; } diff --git a/net/core/filter.c b/net/core/filter.c index 741721233166..514d4082f326 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1166,9 +1166,16 @@ static struct bpf_prog_type_list sk_filter_type __read_mostly = { .type = BPF_PROG_TYPE_SOCKET_FILTER, }; +static struct bpf_prog_type_list sched_cls_type __read_mostly = { + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SCHED_CLS, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); + bpf_register_prog_type(&sched_cls_type); + return 0; } late_initcall(register_sk_filter_ops); -- cgit v1.2.3 From e2e9b6541dd4b31848079da80fe2253daaafb549 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:48 +0100 Subject: cls_bpf: add initial eBPF support for programmable classifiers This work extends the "classic" BPF programmable tc classifier by extending its scope also to native eBPF code! This allows for user space to implement own custom, 'safe' C like classifiers (or whatever other frontend language LLVM et al may provide in future), that can then be compiled with the LLVM eBPF backend to an eBPF elf file. The result of this can be loaded into the kernel via iproute2's tc. In the kernel, they can be JITed on major archs and thus run in native performance. Simple, minimal toy example to demonstrate the workflow: #include #include #include #include "tc_bpf_api.h" __section("classify") int cls_main(struct sk_buff *skb) { return (0x800 << 16) | load_byte(skb, ETH_HLEN + __builtin_offsetof(struct iphdr, tos)); } char __license[] __section("license") = "GPL"; The classifier can then be compiled into eBPF opcodes and loaded via tc, for example: clang -O2 -emit-llvm -c cls.c -o - | llc -march=bpf -filetype=obj -o cls.o tc filter add dev em1 parent 1: bpf cls.o [...] As it has been demonstrated, the scope can even reach up to a fully fledged flow dissector (similarly as in samples/bpf/sockex2_kern.c). For tc, maps are allowed to be used, but from kernel context only, in other words, eBPF code can keep state across filter invocations. In future, we perhaps may reattach from a different application to those maps e.g., to read out collected statistics/state. Similarly as in socket filters, we may extend functionality for eBPF classifiers over time depending on the use cases. For that purpose, cls_bpf programs are using BPF_PROG_TYPE_SCHED_CLS program type, so we can allow additional functions/accessors (e.g. an ABI compatible offset translation to skb fields/metadata). For an initial cls_bpf support, we allow the same set of helper functions as eBPF socket filters, but we could diverge at some point in time w/o problem. I was wondering whether cls_bpf and act_bpf could share C programs, I can imagine that at some point, we introduce i) further common handlers for both (or even beyond their scope), and/or if truly needed ii) some restricted function space for each of them. Both can be abstracted easily through struct bpf_verifier_ops in future. The context of cls_bpf versus act_bpf is slightly different though: a cls_bpf program will return a specific classid whereas act_bpf a drop/non-drop return code, latter may also in future mangle skbs. That said, we can surely have a "classify" and "action" section in a single object file, or considered mentioned constraint add a possibility of a shared section. The workflow for getting native eBPF running from tc [1] is as follows: for f_bpf, I've added a slightly modified ELF parser code from Alexei's kernel sample, which reads out the LLVM compiled object, sets up maps (and dynamically fixes up map fds) if any, and loads the eBPF instructions all centrally through the bpf syscall. The resulting fd from the loaded program itself is being passed down to cls_bpf, which looks up struct bpf_prog from the fd store, and holds reference, so that it stays available also after tc program lifetime. On tc filter destruction, it will then drop its reference. Moreover, I've also added the optional possibility to annotate an eBPF filter with a name (e.g. path to object file, or something else if preferred) so that when tc dumps currently installed filters, some more context can be given to an admin for a given instance (as opposed to just the file descriptor number). Last but not least, bpf_prog_get() and bpf_prog_put() needed to be exported, so that eBPF can be used from cls_bpf built as a module. Thanks to 60a3b2253c41 ("net: bpf: make eBPF interpreter images read-only") I think this is of no concern since anything wanting to alter eBPF opcode after verification stage would crash the kernel. [1] http://git.breakpoint.cc/cgit/dborkman/iproute2.git/log/?h=ebpf Signed-off-by: Daniel Borkmann Cc: Jamal Hadi Salim Cc: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 2 + kernel/bpf/syscall.c | 2 + net/sched/cls_bpf.c | 206 ++++++++++++++++++++++++++++++++----------- 3 files changed, 158 insertions(+), 52 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 25731dfb3fcc..bf08e76bf505 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -397,6 +397,8 @@ enum { TCA_BPF_CLASSID, TCA_BPF_OPS_LEN, TCA_BPF_OPS, + TCA_BPF_FD, + TCA_BPF_NAME, __TCA_BPF_MAX, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d69449acbd0..669719ccc9ee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -419,6 +419,7 @@ void bpf_prog_put(struct bpf_prog *prog) bpf_prog_free(prog); } } +EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) { @@ -466,6 +467,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) fdput(f); return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_get); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD log_buf diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5f3ee9e4b5bf..6f7ed8f8e6ee 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include #include #include @@ -24,6 +26,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Borkmann "); MODULE_DESCRIPTION("TC BPF based classifier"); +#define CLS_BPF_NAME_LEN 256 + struct cls_bpf_head { struct list_head plist; u32 hgen; @@ -32,18 +36,24 @@ struct cls_bpf_head { struct cls_bpf_prog { struct bpf_prog *filter; - struct sock_filter *bpf_ops; - struct tcf_exts exts; - struct tcf_result res; struct list_head link; + struct tcf_result res; + struct tcf_exts exts; u32 handle; - u16 bpf_num_ops; + union { + u32 bpf_fd; + u16 bpf_num_ops; + }; + struct sock_filter *bpf_ops; + const char *bpf_name; struct tcf_proto *tp; struct rcu_head rcu; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_FD] = { .type = NLA_U32 }, + [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, @@ -76,6 +86,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; } +static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) +{ + return !prog->bpf_ops; +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -94,8 +109,12 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) { tcf_exts_destroy(&prog->exts); - bpf_prog_destroy(prog->filter); + if (cls_bpf_is_ebpf(prog)) + bpf_prog_put(prog->filter); + else + bpf_prog_destroy(prog->filter); + kfree(prog->bpf_name); kfree(prog->bpf_ops); kfree(prog); } @@ -114,6 +133,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); + return 0; } @@ -151,69 +171,121 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) return ret; } -static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, - struct cls_bpf_prog *prog, - unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr) +static int cls_bpf_prog_from_ops(struct nlattr **tb, + struct cls_bpf_prog *prog, u32 classid) { struct sock_filter *bpf_ops; - struct tcf_exts exts; - struct sock_fprog_kern tmp; + struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; u16 bpf_size, bpf_num_ops; - u32 classid; int ret; - if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) - return -EINVAL; - - tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); - ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); - if (ret < 0) - return ret; - - classid = nla_get_u32(tb[TCA_BPF_CLASSID]); bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]); - if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) { - ret = -EINVAL; - goto errout; - } + if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) + return -EINVAL; bpf_size = bpf_num_ops * sizeof(*bpf_ops); - if (bpf_size != nla_len(tb[TCA_BPF_OPS])) { - ret = -EINVAL; - goto errout; - } + if (bpf_size != nla_len(tb[TCA_BPF_OPS])) + return -EINVAL; bpf_ops = kzalloc(bpf_size, GFP_KERNEL); - if (bpf_ops == NULL) { - ret = -ENOMEM; - goto errout; - } + if (bpf_ops == NULL) + return -ENOMEM; memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); - tmp.len = bpf_num_ops; - tmp.filter = bpf_ops; + fprog_tmp.len = bpf_num_ops; + fprog_tmp.filter = bpf_ops; - ret = bpf_prog_create(&fp, &tmp); - if (ret) - goto errout_free; + ret = bpf_prog_create(&fp, &fprog_tmp); + if (ret < 0) { + kfree(bpf_ops); + return ret; + } - prog->bpf_num_ops = bpf_num_ops; prog->bpf_ops = bpf_ops; + prog->bpf_num_ops = bpf_num_ops; + prog->bpf_name = NULL; + prog->filter = fp; prog->res.classid = classid; + return 0; +} + +static int cls_bpf_prog_from_efd(struct nlattr **tb, + struct cls_bpf_prog *prog, u32 classid) +{ + struct bpf_prog *fp; + char *name = NULL; + u32 bpf_fd; + + bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); + + fp = bpf_prog_get(bpf_fd); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + if (fp->type != BPF_PROG_TYPE_SCHED_CLS) { + bpf_prog_put(fp); + return -EINVAL; + } + + if (tb[TCA_BPF_NAME]) { + name = kmemdup(nla_data(tb[TCA_BPF_NAME]), + nla_len(tb[TCA_BPF_NAME]), + GFP_KERNEL); + if (!name) { + bpf_prog_put(fp); + return -ENOMEM; + } + } + + prog->bpf_ops = NULL; + prog->bpf_fd = bpf_fd; + prog->bpf_name = name; + + prog->filter = fp; + prog->res.classid = classid; + + return 0; +} + +static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, + struct cls_bpf_prog *prog, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts exts; + bool is_bpf, is_ebpf; + u32 classid; + int ret; + + is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; + is_ebpf = tb[TCA_BPF_FD]; + + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || + !tb[TCA_BPF_CLASSID]) + return -EINVAL; + + tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + if (ret < 0) + return ret; + + classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + + ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) : + cls_bpf_prog_from_efd(tb, prog, classid); + if (ret < 0) { + tcf_exts_destroy(&exts); + return ret; + } + tcf_bind_filter(tp, &prog->res, base); tcf_exts_change(tp, &prog->exts, &exts); return 0; -errout_free: - kfree(bpf_ops); -errout: - tcf_exts_destroy(&exts); - return ret; } static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, @@ -297,11 +369,43 @@ errout: return ret; } +static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, + struct sk_buff *skb) +{ + struct nlattr *nla; + + if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) + return -EMSGSIZE; + + nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops * + sizeof(struct sock_filter)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + return 0; +} + +static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd)) + return -EMSGSIZE; + + if (prog->bpf_name && + nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) + return -EMSGSIZE; + + return 0; +} + static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *tm) { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; - struct nlattr *nest, *nla; + struct nlattr *nest; + int ret; if (prog == NULL) return skb->len; @@ -314,16 +418,14 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) goto nla_put_failure; - if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) - goto nla_put_failure; - nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops * - sizeof(struct sock_filter)); - if (nla == NULL) + if (cls_bpf_is_ebpf(prog)) + ret = cls_bpf_dump_ebpf_info(prog, skb); + else + ret = cls_bpf_dump_bpf_info(prog, skb); + if (ret) goto nla_put_failure; - memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); - if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; -- cgit v1.2.3 From ffc1199122d83d60ad99f9c55df32feb650b7bff Mon Sep 17 00:00:00 2001 From: "Janusz.Dziedzic@tieto.com" Date: Sat, 21 Feb 2015 16:52:39 +0100 Subject: cfg80211: add VHT support for IBSS Add NL80211_EXT_FEATURE_VHT_IBSS flag and VHT support for IBSS. Signed-off-by: Janusz Dziedzic Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 2dcf9bba317c..8ee31f108407 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4328,11 +4328,13 @@ enum nl80211_feature_flags { /** * enum nl80211_ext_feature_index - bit index of extended features. + * @NL80211_EXT_FEATURE_VHT_IBSS: This driver supports IBSS with VHT datarates. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ enum nl80211_ext_feature_index { + NL80211_EXT_FEATURE_VHT_IBSS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9c6e23ede5b2..66666fdf1c8d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7265,8 +7265,18 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) break; case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_40: - if (rdev->wiphy.features & NL80211_FEATURE_HT_IBSS) - break; + if (!(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS)) + return -EINVAL; + break; + case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + if (!(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS)) + return -EINVAL; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_VHT_IBSS)) + return -EINVAL; + break; default: return -EINVAL; } -- cgit v1.2.3 From 5fc7432991a86678b38a2d700edbe8bcd29cc579 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Feb 2015 15:32:43 +0100 Subject: nl80211: add notes about userspace API/ABI modifications Add notes about userspace ABI/API modifications, including the fact that we decided that API submissions should come with a driver implementation. Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8ee31f108407..90c5aeb3cca7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -25,6 +25,19 @@ * */ +/* + * This header file defines the userspace API to the wireless stack. Please + * be careful not to break things - i.e. don't move anything around or so + * unless you can demonstrate that it breaks neither API nor ABI. + * + * Additions to the API should be accompanied by actual implementations in + * an upstream driver, so that example implementations exist in case there + * are ever concerns about the precise semantics of the API or changes are + * needed, and to ensure that code for dead (no longer implemented) API + * can actually be identified and removed. + * Nonetheless, semantics should also be documented carefully in this file. + */ + #include #define NL80211_GENL_NAME "nl80211" -- cgit v1.2.3 From 03c0566542f4c7a45ce3193f27cbf5700b506c18 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 3 Mar 2015 19:13:56 -0600 Subject: mpls: Netlink commands to add, remove, and dump routes This change adds two new netlink routing attributes: RTA_VIA and RTA_NEWDST. RTA_VIA specifies the specifies the next machine to send a packet to like RTA_GATEWAY. RTA_VIA differs from RTA_GATEWAY in that it includes the address family of the address of the next machine to send a packet to. Currently the MPLS code supports addresses in AF_INET, AF_INET6 and AF_PACKET. For AF_INET and AF_INET6 the destination mac address is acquired from the neighbour table. For AF_PACKET the destination mac_address is specified in the netlink configuration. I think raw destination mac address support with the family AF_PACKET will prove useful. There is MPLS-TP which is defined to operate on machines that do not support internet packets of any flavor. Further seem to be corner cases where it can be useful. At this point I don't care much either way. RTA_NEWDST specifies the destination address to forward the packet with. MPLS typically changes it's destination address at every hop. For a swap operation RTA_NEWDST is specified with a length of one label. For a push operation RTA_NEWDST is specified with two or more labels. For a pop operation RTA_NEWDST is not specified or equivalently an emtpy RTAN_NEWDST is specified. Those new netlink attributes are used to implement handling of rt-netlink RTM_NEWROUTE, RTM_DELROUTE, and RTM_GETROUTE messages, to maintain the MPLS label table. rtm_to_route_config parses a netlink RTM_NEWROUTE or RTM_DELROUTE message, verify no unhandled attributes or unhandled values are present and sets up the data structures for mpls_route_add and mpls_route_del. I did my best to match up with the existing conventions with the caveats that MPLS addresses are all destination-specific-addresses, and so don't properly have a scope. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 8 ++ net/mpls/af_mpls.c | 229 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 237 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 5cc5d66bf519..bad65550ae3e 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -303,6 +303,8 @@ enum rtattr_type_t { RTA_TABLE, RTA_MARK, RTA_MFC_STATS, + RTA_VIA, + RTA_NEWDST, __RTA_MAX }; @@ -344,6 +346,12 @@ struct rtnexthop { #define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len)) #define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0))) +/* RTA_VIA */ +struct rtvia { + __kernel_sa_family_t rtvia_family; + __u8 rtvia_addr[0]; +}; + /* RTM_CACHEINFO */ struct rta_cacheinfo { diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 2d6612a10e30..b4d7cec398d2 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -212,6 +212,11 @@ static struct packet_type mpls_packet_type __read_mostly = { .func = mpls_forward, }; +const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { + [RTA_DST] = { .type = NLA_U32 }, + [RTA_OIF] = { .type = NLA_U32 }, +}; + struct mpls_route_config { u32 rc_protocol; u32 rc_ifindex; @@ -410,6 +415,22 @@ static struct notifier_block mpls_dev_notifier = { .notifier_call = mpls_dev_notify, }; +static int nla_put_via(struct sk_buff *skb, + u16 family, const void *addr, int alen) +{ + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + return -EMSGSIZE; + + via = nla_data(nla); + via->rtvia_family = family; + memcpy(via->rtvia_addr, addr, alen); + return 0; +} + int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]) { @@ -467,6 +488,210 @@ int nla_get_labels(const struct nlattr *nla, return 0; } +static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, + struct mpls_route_config *cfg) +{ + struct rtmsg *rtm; + struct nlattr *tb[RTA_MAX+1]; + int index; + int err; + + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + rtm = nlmsg_data(nlh); + memset(cfg, 0, sizeof(*cfg)); + + if (rtm->rtm_family != AF_MPLS) + goto errout; + if (rtm->rtm_dst_len != 20) + goto errout; + if (rtm->rtm_src_len != 0) + goto errout; + if (rtm->rtm_tos != 0) + goto errout; + if (rtm->rtm_table != RT_TABLE_MAIN) + goto errout; + /* Any value is acceptable for rtm_protocol */ + + /* As mpls uses destination specific addresses + * (or source specific address in the case of multicast) + * all addresses have universal scope. + */ + if (rtm->rtm_scope != RT_SCOPE_UNIVERSE) + goto errout; + if (rtm->rtm_type != RTN_UNICAST) + goto errout; + if (rtm->rtm_flags != 0) + goto errout; + + cfg->rc_label = LABEL_NOT_SPECIFIED; + cfg->rc_protocol = rtm->rtm_protocol; + cfg->rc_nlflags = nlh->nlmsg_flags; + cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; + cfg->rc_nlinfo.nlh = nlh; + cfg->rc_nlinfo.nl_net = sock_net(skb->sk); + + for (index = 0; index <= RTA_MAX; index++) { + struct nlattr *nla = tb[index]; + if (!nla) + continue; + + switch(index) { + case RTA_OIF: + cfg->rc_ifindex = nla_get_u32(nla); + break; + case RTA_NEWDST: + if (nla_get_labels(nla, MAX_NEW_LABELS, + &cfg->rc_output_labels, + cfg->rc_output_label)) + goto errout; + break; + case RTA_DST: + { + u32 label_count; + if (nla_get_labels(nla, 1, &label_count, + &cfg->rc_label)) + goto errout; + + /* The first 16 labels are reserved, and may not be set */ + if (cfg->rc_label < 16) + goto errout; + + break; + } + case RTA_VIA: + { + struct rtvia *via = nla_data(nla); + cfg->rc_via_family = via->rtvia_family; + cfg->rc_via_alen = nla_len(nla) - 2; + if (cfg->rc_via_alen > MAX_VIA_ALEN) + goto errout; + + /* Validate the address family */ + switch(cfg->rc_via_family) { + case AF_PACKET: + break; + case AF_INET: + if (cfg->rc_via_alen != 4) + goto errout; + break; + case AF_INET6: + if (cfg->rc_via_alen != 16) + goto errout; + break; + default: + /* Unsupported address family */ + goto errout; + } + + memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen); + break; + } + default: + /* Unsupported attribute */ + goto errout; + } + } + + err = 0; +errout: + return err; +} + +static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct mpls_route_config cfg; + int err; + + err = rtm_to_route_config(skb, nlh, &cfg); + if (err < 0) + return err; + + return mpls_route_del(&cfg); +} + + +static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct mpls_route_config cfg; + int err; + + err = rtm_to_route_config(skb, nlh, &cfg); + if (err < 0) + return err; + + return mpls_route_add(&cfg); +} + +static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, + u32 label, struct mpls_route *rt, int flags) +{ + struct nlmsghdr *nlh; + struct rtmsg *rtm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = AF_MPLS; + rtm->rtm_dst_len = 20; + rtm->rtm_src_len = 0; + rtm->rtm_tos = 0; + rtm->rtm_table = RT_TABLE_MAIN; + rtm->rtm_protocol = rt->rt_protocol; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_type = RTN_UNICAST; + rtm->rtm_flags = 0; + + if (rt->rt_labels && + nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label)) + goto nla_put_failure; + if (nla_put_via(skb, rt->rt_via_family, rt->rt_via, rt->rt_via_alen)) + goto nla_put_failure; + if (rt->rt_dev && nla_put_u32(skb, RTA_OIF, rt->rt_dev->ifindex)) + goto nla_put_failure; + if (nla_put_labels(skb, RTA_DST, 1, &label)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + unsigned int index; + + ASSERT_RTNL(); + + index = cb->args[0]; + if (index < 16) + index = 16; + + for (; index < net->mpls.platform_labels; index++) { + struct mpls_route *rt; + rt = net->mpls.platform_label[index]; + if (!rt) + continue; + + if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, + index, rt, NLM_F_MULTI) < 0) + break; + } + cb->args[0] = index; + + return skb->len; +} + static int resize_platform_label_table(struct net *net, size_t limit) { size_t size = sizeof(struct mpls_route *) * limit; @@ -662,6 +887,9 @@ static int __init mpls_init(void) dev_add_pack(&mpls_packet_type); + rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL); + rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL); + rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL); err = 0; out: return err; @@ -674,6 +902,7 @@ module_init(mpls_init); static void __exit mpls_exit(void) { + rtnl_unregister_all(PF_MPLS); dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); -- cgit v1.2.3 From 8de147dc8e2adea82b8a1a2a08fcc983330f6770 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 3 Mar 2015 19:14:31 -0600 Subject: mpls: Multicast route table change notifications Unlike IPv4 this code notifies on all cases where mpls routes are added or removed and it never automatically removes routes. Avoiding both the userspace confusion that is caused by omitting route updates and the possibility of a flood of netlink traffic when an interface goes doew. For now reserved labels are handled automatically and userspace is not notified. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 ++ net/mpls/af_mpls.c | 60 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index bad65550ae3e..06f75a407f74 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -631,6 +631,8 @@ enum rtnetlink_groups { #define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF RTNLGRP_MDB, #define RTNLGRP_MDB RTNLGRP_MDB + RTNLGRP_MPLS_ROUTE, +#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index b4d7cec398d2..75a994a50381 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -36,6 +36,10 @@ struct mpls_route { /* next hop label forwarding entry */ static int zero = 0; static int label_limit = (1 << 20) - 1; +static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, + struct nlmsghdr *nlh, struct net *net, u32 portid, + unsigned int nlm_flags); + static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) { struct mpls_route *rt = NULL; @@ -246,6 +250,20 @@ static void mpls_rt_free(struct mpls_route *rt) kfree_rcu(rt, rt_rcu); } +static void mpls_notify_route(struct net *net, unsigned index, + struct mpls_route *old, struct mpls_route *new, + const struct nl_info *info) +{ + struct nlmsghdr *nlh = info ? info->nlh : NULL; + unsigned portid = info ? info->portid : 0; + int event = new ? RTM_NEWROUTE : RTM_DELROUTE; + struct mpls_route *rt = new ? new : old; + unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0; + /* Ignore reserved labels for now */ + if (rt && (index >= 16)) + rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags); +} + static void mpls_route_update(struct net *net, unsigned index, struct net_device *dev, struct mpls_route *new, const struct nl_info *info) @@ -260,6 +278,8 @@ static void mpls_route_update(struct net *net, unsigned index, old = rt; } + mpls_notify_route(net, index, old, new, info); + /* If we removed a route free it now */ mpls_rt_free(old); } @@ -692,6 +712,46 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static inline size_t lfib_nlmsg_size(struct mpls_route *rt) +{ + size_t payload = + NLMSG_ALIGN(sizeof(struct rtmsg)) + + nla_total_size(2 + rt->rt_via_alen) /* RTA_VIA */ + + nla_total_size(4); /* RTA_DST */ + if (rt->rt_labels) /* RTA_NEWDST */ + payload += nla_total_size(rt->rt_labels * 4); + if (rt->rt_dev) /* RTA_OIF */ + payload += nla_total_size(4); + return payload; +} + +static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, + struct nlmsghdr *nlh, struct net *net, u32 portid, + unsigned int nlm_flags) +{ + struct sk_buff *skb; + u32 seq = nlh ? nlh->nlmsg_seq : 0; + int err = -ENOBUFS; + + skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = mpls_dump_route(skb, portid, seq, event, label, rt, nlm_flags); + if (err < 0) { + /* -EMSGSIZE implies BUG in lfib_nlmsg_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, portid, RTNLGRP_MPLS_ROUTE, nlh, GFP_KERNEL); + + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err); +} + static int resize_platform_label_table(struct net *net, size_t limit) { size_t size = sizeof(struct mpls_route *) * limit; -- cgit v1.2.3 From 98fc43864af9e74116eec81c290db048cded15d8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 1 Mar 2015 09:10:13 +0200 Subject: nl80211: prohibit mixing 'any' and regular wowlan triggers If the device supports waking up on 'any' signal - i.e. it continues operating as usual and wakes up the host on pretty much anything that happens, then it makes no sense to also configure the more restricted WoWLAN mode where the device operates more autonomously but also in a more restricted fashion. Currently only cw2100 supports both 'any' and other triggers, but it seems to be broken as it doesn't configure anything to the device, so we can't currently get into a situation where both even can correctly be configured. This is about to change (Intel devices are going to support both and have different behaviour depending on configuration) so make sure the conflicting modes cannot be configured. (It seems that cw2100 advertises 'any' and 'disconnect' as a means of saying that's what it will always do, but that isn't really the way this API was meant to be used nor does it actually mean anything as 'any' always implies 'disconnect' already, and the driver doesn't change device configuration in any way depending on the settings.) Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 90c5aeb3cca7..37e7f39441e5 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3708,6 +3708,8 @@ struct nl80211_pattern_support { * @NL80211_WOWLAN_TRIG_ANY: wake up on any activity, do not really put * the chip into a special state -- works best with chips that have * support for low-power operation already (flag) + * Note that this mode is incompatible with all of the others, if + * any others are even supported by the device. * @NL80211_WOWLAN_TRIG_DISCONNECT: wake up on disconnect, the way disconnect * is detected is implementation-specific (flag) * @NL80211_WOWLAN_TRIG_MAGIC_PKT: wake up on magic packet (6x 0xff, followed diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 01874628ae00..07cef3d7653e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9105,6 +9105,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) const struct wiphy_wowlan_support *wowlan = rdev->wiphy.wowlan; int err, i; bool prev_enabled = rdev->wiphy.wowlan_config; + bool regular = false; if (!wowlan) return -EOPNOTSUPP; @@ -9132,12 +9133,14 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) if (!(wowlan->flags & WIPHY_WOWLAN_DISCONNECT)) return -EINVAL; new_triggers.disconnect = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_MAGIC_PKT]) { if (!(wowlan->flags & WIPHY_WOWLAN_MAGIC_PKT)) return -EINVAL; new_triggers.magic_pkt = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED]) @@ -9147,24 +9150,28 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) if (!(wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE)) return -EINVAL; new_triggers.gtk_rekey_failure = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST]) { if (!(wowlan->flags & WIPHY_WOWLAN_EAP_IDENTITY_REQ)) return -EINVAL; new_triggers.eap_identity_req = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE]) { if (!(wowlan->flags & WIPHY_WOWLAN_4WAY_HANDSHAKE)) return -EINVAL; new_triggers.four_way_handshake = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_RFKILL_RELEASE]) { if (!(wowlan->flags & WIPHY_WOWLAN_RFKILL_RELEASE)) return -EINVAL; new_triggers.rfkill_release = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_PKT_PATTERN]) { @@ -9173,6 +9180,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) int rem, pat_len, mask_len, pkt_offset; struct nlattr *pat_tb[NUM_NL80211_PKTPAT]; + regular = true; + nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN], rem) n_patterns++; @@ -9234,6 +9243,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) } if (tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION]) { + regular = true; err = nl80211_parse_wowlan_tcp( rdev, tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION], &new_triggers); @@ -9242,6 +9252,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) } if (tb[NL80211_WOWLAN_TRIG_NET_DETECT]) { + regular = true; err = nl80211_parse_wowlan_nd( rdev, wowlan, tb[NL80211_WOWLAN_TRIG_NET_DETECT], &new_triggers); @@ -9249,6 +9260,17 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) goto error; } + /* The 'any' trigger means the device continues operating more or less + * as in its normal operation mode and wakes up the host on most of the + * normal interrupts (like packet RX, ...) + * It therefore makes little sense to combine with the more constrained + * wakeup trigger modes. + */ + if (new_triggers.any && regular) { + err = -EINVAL; + goto error; + } + ntrig = kmemdup(&new_triggers, sizeof(new_triggers), GFP_KERNEL); if (!ntrig) { err = -ENOMEM; -- cgit v1.2.3 From 1a6ab46fa9c2bc9399694b4856ab7ea19c036485 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 4 Mar 2015 10:56:13 +0900 Subject: ALSA: Fix spelling typo in Documentation/DocBook/alsa-driver-api.xml This patch fix spelling typo found in alsa-driver-api.xml. It is because this file is generated from comments in source files, I have to fix source files. Signed-off-by: Masanari Iida Signed-off-by: Takashi Iwai --- include/sound/compress_driver.h | 4 ++-- include/sound/control.h | 2 +- include/sound/soc.h | 2 +- include/uapi/sound/compress_offload.h | 2 +- sound/core/pcm_dmaengine.c | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/sound/compress_driver.h b/include/sound/compress_driver.h index f48089d364c5..fa1d05512c09 100644 --- a/include/sound/compress_driver.h +++ b/include/sound/compress_driver.h @@ -70,7 +70,7 @@ struct snd_compr_runtime { * @device: device pointer * @direction: stream direction, playback/recording * @metadata_set: metadata set flag, true when set - * @next_track: has userspace signall next track transistion, true when set + * @next_track: has userspace signal next track transition, true when set * @private_data: pointer to DSP private data */ struct snd_compr_stream { @@ -95,7 +95,7 @@ struct snd_compr_stream { * and the stream properties * @get_params: retrieve the codec parameters, mandatory * @set_metadata: Set the metadata values for a stream - * @get_metadata: retreives the requested metadata values from stream + * @get_metadata: retrieves the requested metadata values from stream * @trigger: Trigger operations like start, pause, resume, drain, stop. * This callback is mandatory * @pointer: Retrieve current h/w pointer information. Mandatory diff --git a/include/sound/control.h b/include/sound/control.h index 75f3054023f7..95aad6d3fd1a 100644 --- a/include/sound/control.h +++ b/include/sound/control.h @@ -227,7 +227,7 @@ snd_ctl_add_slave(struct snd_kcontrol *master, struct snd_kcontrol *slave) * Add a virtual slave control to the given master. * Unlike snd_ctl_add_slave(), the element added via this function * is supposed to have volatile values, and get callback is called - * at each time quried from the master. + * at each time queried from the master. * * When the control peeks the hardware values directly and the value * can be changed by other means than the put callback of the element, diff --git a/include/sound/soc.h b/include/sound/soc.h index 0d1ade195628..cf0bb156d6da 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1469,7 +1469,7 @@ static inline struct snd_soc_codec *snd_soc_kcontrol_codec( } /** - * snd_soc_kcontrol_platform() - Returns the platform that registerd the control + * snd_soc_kcontrol_platform() - Returns the platform that registered the control * @kcontrol: The control for which to get the platform * * Note: This function will only work correctly if the control has been diff --git a/include/uapi/sound/compress_offload.h b/include/uapi/sound/compress_offload.h index 22ed8cb7800b..e00d8cbfc628 100644 --- a/include/uapi/sound/compress_offload.h +++ b/include/uapi/sound/compress_offload.h @@ -75,7 +75,7 @@ struct snd_compr_tstamp { /** * struct snd_compr_avail - avail descriptor * @avail: Number of bytes available in ring buffer for writing/reading - * @tstamp: timestamp infomation + * @tstamp: timestamp information */ struct snd_compr_avail { __u64 avail; diff --git a/sound/core/pcm_dmaengine.c b/sound/core/pcm_dmaengine.c index 6542c4083594..fba365a78390 100644 --- a/sound/core/pcm_dmaengine.c +++ b/sound/core/pcm_dmaengine.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_request_channel); * * The function should usually be called from the pcm open callback. Note that * this function will use private_data field of the substream's runtime. So it - * is not availabe to your pcm driver implementation. + * is not available to your pcm driver implementation. */ int snd_dmaengine_pcm_open(struct snd_pcm_substream *substream, struct dma_chan *chan) @@ -328,7 +328,7 @@ EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_open); * This function will request a DMA channel using the passed filter function and * data. The function should usually be called from the pcm open callback. Note * that this function will use private_data field of the substream's runtime. So - * it is not availabe to your pcm driver implementation. + * it is not available to your pcm driver implementation. */ int snd_dmaengine_pcm_open_request_chan(struct snd_pcm_substream *substream, dma_filter_fn filter_fn, void *filter_data) -- cgit v1.2.3 From f3dddf2432e3123ef34b470129295641f7513d26 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 4 Mar 2015 14:10:26 -0800 Subject: HID: map telephony usage page Currently HID code maps usages from telephony page into BTN_0, BTN_1, etc keys which get interpreted by mousedev and userspace as left/right/middle button clicks, which is not really helpful. This change adds mappings for usages that have corresponding input event definitions, and leaves the rest unmapped. This can be changed when there are userspace consumers for more telephony usages. Signed-off-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 23 +++++++++++++++++++++++ include/linux/hid.h | 1 + include/uapi/linux/input.h | 4 ++++ 3 files changed, 28 insertions(+) (limited to 'include/uapi') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 052869d0ab78..19603efc8fa2 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -711,6 +711,29 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel } break; + case HID_UP_TELEPHONY: + switch (usage->hid & HID_USAGE) { + case 0x2f: map_key_clear(KEY_MICMUTE); break; + case 0xb0: map_key_clear(KEY_NUMERIC_0); break; + case 0xb1: map_key_clear(KEY_NUMERIC_1); break; + case 0xb2: map_key_clear(KEY_NUMERIC_2); break; + case 0xb3: map_key_clear(KEY_NUMERIC_3); break; + case 0xb4: map_key_clear(KEY_NUMERIC_4); break; + case 0xb5: map_key_clear(KEY_NUMERIC_5); break; + case 0xb6: map_key_clear(KEY_NUMERIC_6); break; + case 0xb7: map_key_clear(KEY_NUMERIC_7); break; + case 0xb8: map_key_clear(KEY_NUMERIC_8); break; + case 0xb9: map_key_clear(KEY_NUMERIC_9); break; + case 0xba: map_key_clear(KEY_NUMERIC_STAR); break; + case 0xbb: map_key_clear(KEY_NUMERIC_POUND); break; + case 0xbc: map_key_clear(KEY_NUMERIC_A); break; + case 0xbd: map_key_clear(KEY_NUMERIC_B); break; + case 0xbe: map_key_clear(KEY_NUMERIC_C); break; + case 0xbf: map_key_clear(KEY_NUMERIC_D); break; + default: goto ignore; + } + break; + case HID_UP_CONSUMER: /* USB HUT v1.12, pages 75-84 */ switch (usage->hid & HID_USAGE) { case 0x000: goto ignore; diff --git a/include/linux/hid.h b/include/linux/hid.h index efc7787a41a8..69f9cf7f078d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -159,6 +159,7 @@ struct hid_item { #define HID_UP_LED 0x00080000 #define HID_UP_BUTTON 0x00090000 #define HID_UP_ORDINAL 0x000a0000 +#define HID_UP_TELEPHONY 0x000b0000 #define HID_UP_CONSUMER 0x000c0000 #define HID_UP_DIGITIZER 0x000d0000 #define HID_UP_PID 0x000f0000 diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index b0a813079852..2b628c316882 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -702,6 +702,10 @@ struct input_keymap_entry { #define KEY_NUMERIC_9 0x209 #define KEY_NUMERIC_STAR 0x20a #define KEY_NUMERIC_POUND 0x20b +#define KEY_NUMERIC_A 0x20c /* Phone key A - HUT Telephony 0xb9 */ +#define KEY_NUMERIC_B 0x20d +#define KEY_NUMERIC_C 0x20e +#define KEY_NUMERIC_D 0x20f #define KEY_CAMERA_FOCUS 0x210 #define KEY_WPS_BUTTON 0x211 /* WiFi Protected Setup key */ -- cgit v1.2.3 From 842a9ae08a25671db3d4f689eed68b4d64be15b5 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 4 Mar 2015 12:54:21 +0200 Subject: bridge: Extend Proxy ARP design to allow optional rules for Wi-Fi This extends the design in commit 958501163ddd ("bridge: Add support for IEEE 802.11 Proxy ARP") with optional set of rules that are needed to meet the IEEE 802.11 and Hotspot 2.0 requirements for ProxyARP. The previously added BR_PROXYARP behavior is left as-is and a new BR_PROXYARP_WIFI alternative is added so that this behavior can be configured from user space when required. In addition, this enables proxyarp functionality for unicast ARP requests for both BR_PROXYARP and BR_PROXYARP_WIFI since it is possible to use unicast as well as broadcast for these frames. The key differences in functionality: BR_PROXYARP: - uses the flag on the bridge port on which the request frame was received to determine whether to reply - block bridge port flooding completely on ports that enable proxy ARP BR_PROXYARP_WIFI: - uses the flag on the bridge port to which the target device of the request belongs - block bridge port flooding selectively based on whether the proxyarp functionality replied Signed-off-by: Jouni Malinen Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + net/bridge/br_forward.c | 3 +++ net/bridge/br_input.c | 17 ++++++++++------- net/bridge/br_netlink.c | 5 ++++- net/bridge/br_private.h | 1 + net/bridge/br_sysfs_if.c | 2 ++ 7 files changed, 22 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index a57bca2ea97e..dad8b00beed2 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -44,6 +44,7 @@ struct br_ip_list { #define BR_PROMISC BIT(7) #define BR_PROXYARP BIT(8) #define BR_LEARNING_SYNC BIT(9) +#define BR_PROXYARP_WIFI BIT(10) extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index dfd0bb22e554..756436e1ce89 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -247,6 +247,7 @@ enum { IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ IFLA_BRPORT_PROXYARP, /* proxy ARP */ IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ + IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */ __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index f96933a823e3..1238fabff874 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -188,6 +188,9 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) continue; + if ((p->flags & BR_PROXYARP_WIFI) && + BR_INPUT_SKB_CB(skb)->proxyarp_replied) + continue; prev = maybe_deliver(prev, p, skb, __packet_hook); if (IS_ERR(prev)) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index e2aa7be3a847..052c5ebbc947 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -60,7 +60,7 @@ static int br_pass_frame_up(struct sk_buff *skb) } static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, - u16 vid) + u16 vid, struct net_bridge_port *p) { struct net_device *dev = br->dev; struct neighbour *n; @@ -68,6 +68,8 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, u8 *arpptr, *sha; __be32 sip, tip; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + if (dev->flags & IFF_NOARP) return; @@ -105,9 +107,12 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, } f = __br_fdb_get(br, n->ha, vid); - if (f) + if (f && ((p->flags & BR_PROXYARP) || + (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) { arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip, sha, n->ha, sha); + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + } neigh_release(n); } @@ -153,12 +158,10 @@ int br_handle_frame_finish(struct sk_buff *skb) dst = NULL; - if (is_broadcast_ether_addr(dest)) { - if (IS_ENABLED(CONFIG_INET) && - p->flags & BR_PROXYARP && - skb->protocol == htons(ETH_P_ARP)) - br_do_proxy_arp(skb, br, vid); + if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) + br_do_proxy_arp(skb, br, vid, p); + if (is_broadcast_ether_addr(dest)) { skb2 = skb; unicast = false; } else if (is_multicast_ether_addr(dest)) { diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index c72083968768..8bc6b67457dc 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -143,7 +143,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || - nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP))) + nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || + nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, + !!(p->flags & BR_PROXYARP_WIFI))) return -EMSGSIZE; return 0; @@ -553,6 +555,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); + br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index de0919975a25..c32e279c62f8 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -305,6 +305,7 @@ struct br_input_skb_cb { #endif u16 frag_max_size; + bool proxyarp_replied; #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool vlan_filtered; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 2de5d91199e8..4905845a94e9 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -171,6 +171,7 @@ BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); BRPORT_ATTR_FLAG(learning, BR_LEARNING); BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); +BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -215,6 +216,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_multicast_fast_leave, #endif &brport_attr_proxyarp, + &brport_attr_proxyarp_wifi, NULL }; -- cgit v1.2.3 From 1cae565e8b746f484f1ff1b71d2a1c89d7cf0668 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 5 Mar 2015 15:05:36 +0100 Subject: netfilter: nf_tables: limit maximum table name length to 32 bytes Set the same as we use for chain names, it should be enough. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 7 ++++--- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 04188b47629d..a143acafa5d9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -535,7 +535,7 @@ struct nft_table { u64 hgenerator; u32 use; u16 flags; - char name[]; + char name[NFT_TABLE_MAXNAMELEN]; }; /** diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 832bc46db78b..b9783931503b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1,6 +1,7 @@ #ifndef _LINUX_NF_TABLES_H #define _LINUX_NF_TABLES_H +#define NFT_TABLE_MAXNAMELEN 32 #define NFT_CHAIN_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 199fd0f27b0e..284b20ce566b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -401,7 +401,8 @@ nf_tables_chain_type_lookup(const struct nft_af_info *afi, } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { - [NFTA_TABLE_NAME] = { .type = NLA_STRING }, + [NFTA_TABLE_NAME] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, }; @@ -686,13 +687,13 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, if (!try_module_get(afi->owner)) return -EAFNOSUPPORT; - table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL); + table = kzalloc(sizeof(*table), GFP_KERNEL); if (table == NULL) { module_put(afi->owner); return -ENOMEM; } - nla_strlcpy(table->name, name, nla_len(name)); + nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); table->flags = flags; -- cgit v1.2.3 From d0f91938bede204a343473792529e0db7d599836 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 5 Mar 2015 10:23:49 +0100 Subject: tipc: add ip/udp media type The ip/udp bearer can be configured in a point-to-point mode by specifying both local and remote ip/hostname, or it can be enabled in multicast mode, where links are established to all tipc nodes that have joined the same multicast group. The multicast IP address is generated based on the TIPC network ID, but can be overridden by using another multicast address as remote ip. Signed-off-by: Erik Hugne Reviewed-by: Jon Maloy Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 9 + net/tipc/Kconfig | 8 + net/tipc/Makefile | 1 + net/tipc/bearer.c | 13 +- net/tipc/bearer.h | 12 +- net/tipc/msg.h | 2 +- net/tipc/udp_media.c | 442 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 479 insertions(+), 8 deletions(-) create mode 100644 net/tipc/udp_media.c (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 8d723824ad69..d4c8f142ba63 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -83,11 +83,20 @@ enum { TIPC_NLA_BEARER_NAME, /* string */ TIPC_NLA_BEARER_PROP, /* nest */ TIPC_NLA_BEARER_DOMAIN, /* u32 */ + TIPC_NLA_BEARER_UDP_OPTS, /* nest */ __TIPC_NLA_BEARER_MAX, TIPC_NLA_BEARER_MAX = __TIPC_NLA_BEARER_MAX - 1 }; +enum { + TIPC_NLA_UDP_UNSPEC, + TIPC_NLA_UDP_LOCAL, /* sockaddr_storage */ + TIPC_NLA_UDP_REMOTE, /* sockaddr_storage */ + + __TIPC_NLA_UDP_MAX, + TIPC_NLA_UDP_MAX = __TIPC_NLA_UDP_MAX - 1 +}; /* Socket info */ enum { TIPC_NLA_SOCK_UNSPEC, diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index 91c8a8e031db..c25a3a149dc4 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -26,3 +26,11 @@ config TIPC_MEDIA_IB help Saying Y here will enable support for running TIPC on IP-over-InfiniBand devices. +config TIPC_MEDIA_UDP + bool "IP/UDP media type support" + depends on TIPC + select NET_UDP_TUNNEL + help + Saying Y here will enable support for running TIPC over IP/UDP + bool + default y diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 599b1a540d2b..57e460be4692 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -10,5 +10,6 @@ tipc-y += addr.o bcast.o bearer.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ server.o socket.o +tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index af6deeb397a8..840db89e4283 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -47,6 +47,9 @@ static struct tipc_media * const media_info_array[] = { ð_media_info, #ifdef CONFIG_TIPC_MEDIA_IB &ib_media_info, +#endif +#ifdef CONFIG_TIPC_MEDIA_UDP + &udp_media_info, #endif NULL }; @@ -216,7 +219,8 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) * tipc_enable_bearer - enable bearer with the given name */ static int tipc_enable_bearer(struct net *net, const char *name, - u32 disc_domain, u32 priority) + u32 disc_domain, u32 priority, + struct nlattr *attr[]) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_bearer *b_ptr; @@ -304,7 +308,7 @@ restart: strcpy(b_ptr->name, name); b_ptr->media = m_ptr; - res = m_ptr->enable_media(net, b_ptr); + res = m_ptr->enable_media(net, b_ptr, attr); if (res) { pr_warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res); @@ -372,7 +376,8 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr, kfree_rcu(b_ptr, rcu); } -int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b) +int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, + struct nlattr *attr[]) { struct net_device *dev; char *driver_name = strchr((const char *)b->name, ':') + 1; @@ -791,7 +796,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) } rtnl_lock(); - err = tipc_enable_bearer(net, bearer, domain, prio); + err = tipc_enable_bearer(net, bearer, domain, prio, attrs); if (err) { rtnl_unlock(); return err; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 097aff08ad5b..5cad243ee8fc 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -41,7 +41,7 @@ #include #define MAX_BEARERS 2 -#define MAX_MEDIA 2 +#define MAX_MEDIA 3 #define MAX_NODES 4096 #define WSIZE 32 @@ -59,6 +59,7 @@ */ #define TIPC_MEDIA_TYPE_ETH 1 #define TIPC_MEDIA_TYPE_IB 2 +#define TIPC_MEDIA_TYPE_UDP 3 /** * struct tipc_node_map - set of node identifiers @@ -104,7 +105,8 @@ struct tipc_media { int (*send_msg)(struct net *net, struct sk_buff *buf, struct tipc_bearer *b_ptr, struct tipc_media_addr *dest); - int (*enable_media)(struct net *net, struct tipc_bearer *b_ptr); + int (*enable_media)(struct net *net, struct tipc_bearer *b_ptr, + struct nlattr *attr[]); void (*disable_media)(struct tipc_bearer *b_ptr); int (*addr2str)(struct tipc_media_addr *addr, char *strbuf, @@ -183,6 +185,9 @@ extern struct tipc_media eth_media_info; #ifdef CONFIG_TIPC_MEDIA_IB extern struct tipc_media ib_media_info; #endif +#ifdef CONFIG_TIPC_MEDIA_UDP +extern struct tipc_media udp_media_info; +#endif int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); @@ -197,7 +202,8 @@ int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); -int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b); +int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, + struct nlattr *attrs[]); void tipc_disable_l2_media(struct tipc_bearer *b); int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, struct tipc_bearer *b, struct tipc_media_addr *dest); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index c1cc8d7a5d52..fa167846d1ab 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -87,7 +87,7 @@ struct plist; * Note: Headroom should be a multiple of 4 to ensure the TIPC header fields * are word aligned for quicker access */ -#define BUF_HEADROOM LL_MAX_HEADER +#define BUF_HEADROOM (LL_MAX_HEADER + 48) struct tipc_skb_cb { void *handle; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c new file mode 100644 index 000000000000..0d10001db40d --- /dev/null +++ b/net/tipc/udp_media.c @@ -0,0 +1,442 @@ +/* net/tipc/udp_media.c: IP bearer support for TIPC + * + * Copyright (c) 2015, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "core.h" +#include "bearer.h" + +/* IANA assigned UDP port */ +#define UDP_PORT_DEFAULT 6118 + +static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { + [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, + [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, + [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, +}; + +/** + * struct udp_media_addr - IP/UDP addressing information + * + * This is the bearer level originating address used in neighbor discovery + * messages, and all fields should be in network byte order + */ +struct udp_media_addr { + __be16 proto; + __be16 udp_port; + union { + struct in_addr ipv4; + struct in6_addr ipv6; + }; +}; + +/** + * struct udp_bearer - ip/udp bearer data structure + * @bearer: associated generic tipc bearer + * @ubsock: bearer associated socket + * @ifindex: local address scope + * @work: used to schedule deferred work on a bearer + */ +struct udp_bearer { + struct tipc_bearer __rcu *bearer; + struct socket *ubsock; + u32 ifindex; + struct work_struct work; +}; + +/* udp_media_addr_set - convert a ip/udp address to a TIPC media address */ +static void tipc_udp_media_addr_set(struct tipc_media_addr *addr, + struct udp_media_addr *ua) +{ + memset(addr, 0, sizeof(struct tipc_media_addr)); + addr->media_id = TIPC_MEDIA_TYPE_UDP; + memcpy(addr->value, ua, sizeof(struct udp_media_addr)); + if (ntohs(ua->proto) == ETH_P_IP) { + if (ipv4_is_multicast(ua->ipv4.s_addr)) + addr->broadcast = 1; + } else if (ntohs(ua->proto) == ETH_P_IPV6) { + if (ipv6_addr_type(&ua->ipv6) & IPV6_ADDR_MULTICAST) + addr->broadcast = 1; + } else { + pr_err("Invalid UDP media address\n"); + } +} + +/* tipc_udp_addr2str - convert ip/udp address to string */ +static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size) +{ + struct udp_media_addr *ua = (struct udp_media_addr *)&a->value; + + if (ntohs(ua->proto) == ETH_P_IP) + snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->udp_port)); + else if (ntohs(ua->proto) == ETH_P_IPV6) + snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->udp_port)); + else + pr_err("Invalid UDP media address\n"); + return 0; +} + +/* tipc_udp_msg2addr - extract an ip/udp address from a TIPC ndisc message */ +static int tipc_udp_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *a, + char *msg) +{ + struct udp_media_addr *ua; + + ua = (struct udp_media_addr *) (msg + TIPC_MEDIA_ADDR_OFFSET); + if (msg[TIPC_MEDIA_TYPE_OFFSET] != TIPC_MEDIA_TYPE_UDP) + return -EINVAL; + tipc_udp_media_addr_set(a, ua); + return 0; +} + +/* tipc_udp_addr2msg - write an ip/udp address to a TIPC ndisc message */ +static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a) +{ + memset(msg, 0, TIPC_MEDIA_INFO_SIZE); + msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_UDP; + memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, a->value, + sizeof(struct udp_media_addr)); + return 0; +} + +/* tipc_send_msg - enqueue a send request */ +static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *dest) +{ + int ttl, err = 0; + struct udp_bearer *ub; + struct udp_media_addr *dst = (struct udp_media_addr *)&dest->value; + struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; + struct sk_buff *clone; + struct rtable *rt; + + clone = skb_clone(skb, GFP_ATOMIC); + skb_set_inner_protocol(clone, htons(ETH_P_TIPC)); + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + err = -ENODEV; + goto tx_error; + } + if (htons(dst->proto) == ETH_P_IP) { + struct flowi4 fl = { + .daddr = dst->ipv4.s_addr, + .saddr = src->ipv4.s_addr, + .flowi4_mark = clone->mark, + .flowi4_proto = IPPROTO_UDP + }; + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto tx_error; + } + ttl = ip4_dst_hoplimit(&rt->dst); + err = udp_tunnel_xmit_skb(rt, clone, src->ipv4.s_addr, + dst->ipv4.s_addr, 0, ttl, 0, + src->udp_port, dst->udp_port, + false, true); + if (err < 0) { + ip_rt_put(rt); + goto tx_error; + } +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct dst_entry *ndst; + struct flowi6 fl6 = { + .flowi6_oif = ub->ifindex, + .daddr = dst->ipv6, + .saddr = src->ipv6, + .flowi6_proto = IPPROTO_UDP + }; + err = ip6_dst_lookup(ub->ubsock->sk, &ndst, &fl6); + if (err) + goto tx_error; + ttl = ip6_dst_hoplimit(ndst); + err = udp_tunnel6_xmit_skb(ndst, clone, ndst->dev, &src->ipv6, + &dst->ipv6, 0, ttl, src->udp_port, + dst->udp_port, false); +#endif + } + return err; + +tx_error: + kfree_skb(clone); + return err; +} + +/* tipc_udp_recv - read data from bearer socket */ +static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) +{ + struct udp_bearer *ub; + struct tipc_bearer *b; + + ub = rcu_dereference_sk_user_data(sk); + if (!ub) { + pr_err_ratelimited("Failed to get UDP bearer reference"); + kfree_skb(skb); + return 0; + } + + skb_pull(skb, sizeof(struct udphdr)); + rcu_read_lock(); + b = rcu_dereference_rtnl(ub->bearer); + + if (b) { + tipc_rcv(sock_net(sk), skb, b); + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + kfree_skb(skb); + return 0; +} + +static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) +{ + int err = 0; + struct ip_mreqn mreqn; + struct sock *sk = ub->ubsock->sk; + + if (ntohs(remote->proto) == ETH_P_IP) { + if (!ipv4_is_multicast(remote->ipv4.s_addr)) + return 0; + mreqn.imr_multiaddr = remote->ipv4; + mreqn.imr_ifindex = ub->ifindex; + err = __ip_mc_join_group(sk, &mreqn); + } else { + if (!ipv6_addr_is_multicast(&remote->ipv6)) + return 0; + err = __ipv6_sock_mc_join(sk, ub->ifindex, &remote->ipv6); + } + return err; +} + +/** + * parse_options - build local/remote addresses from configuration + * @attrs: netlink config data + * @ub: UDP bearer instance + * @local: local bearer IP address/port + * @remote: peer or multicast IP/port + */ +static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub, + struct udp_media_addr *local, + struct udp_media_addr *remote) +{ + struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; + struct sockaddr_storage *sa_local, *sa_remote; + + if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) + goto err; + if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, + attrs[TIPC_NLA_BEARER_UDP_OPTS], + tipc_nl_udp_policy)) + goto err; + if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) { + sa_local = nla_data(opts[TIPC_NLA_UDP_LOCAL]); + sa_remote = nla_data(opts[TIPC_NLA_UDP_REMOTE]); + } else { +err: + pr_err("Invalid UDP bearer configuration"); + return -EINVAL; + } + if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET) { + struct sockaddr_in *ip4; + + ip4 = (struct sockaddr_in *)sa_local; + local->proto = htons(ETH_P_IP); + local->udp_port = ip4->sin_port; + local->ipv4.s_addr = ip4->sin_addr.s_addr; + + ip4 = (struct sockaddr_in *)sa_remote; + remote->proto = htons(ETH_P_IP); + remote->udp_port = ip4->sin_port; + remote->ipv4.s_addr = ip4->sin_addr.s_addr; + return 0; + +#if IS_ENABLED(CONFIG_IPV6) + } else if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET6) { + struct sockaddr_in6 *ip6; + + ip6 = (struct sockaddr_in6 *)sa_local; + local->proto = htons(ETH_P_IPV6); + local->udp_port = ip6->sin6_port; + local->ipv6 = ip6->sin6_addr; + ub->ifindex = ip6->sin6_scope_id; + + ip6 = (struct sockaddr_in6 *)sa_remote; + remote->proto = htons(ETH_P_IPV6); + remote->udp_port = ip6->sin6_port; + remote->ipv6 = ip6->sin6_addr; + return 0; +#endif + } + return -EADDRNOTAVAIL; +} + +/** + * tipc_udp_enable - callback to create a new udp bearer instance + * @net: network namespace + * @b: pointer to generic tipc_bearer + * @attrs: netlink bearer configuration + * + * validate the bearer parameters and initialize the udp bearer + * rtnl_lock should be held + */ +static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, + struct nlattr *attrs[]) +{ + int err = -EINVAL; + struct udp_bearer *ub; + struct udp_media_addr *remote; + struct udp_media_addr local = {0}; + struct udp_port_cfg udp_conf = {0}; + struct udp_tunnel_sock_cfg tuncfg = {0}; + + ub = kzalloc(sizeof(*ub), GFP_ATOMIC); + if (!ub) + return -ENOMEM; + + remote = (struct udp_media_addr *)&b->bcast_addr.value; + memset(remote, 0, sizeof(struct udp_media_addr)); + err = parse_options(attrs, ub, &local, remote); + if (err) + goto err; + + b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP; + b->bcast_addr.broadcast = 1; + rcu_assign_pointer(b->media_ptr, ub); + rcu_assign_pointer(ub->bearer, b); + tipc_udp_media_addr_set(&b->addr, &local); + if (htons(local.proto) == ETH_P_IP) { + struct net_device *dev; + + dev = __ip_dev_find(net, local.ipv4.s_addr, false); + if (!dev) { + err = -ENODEV; + goto err; + } + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + udp_conf.use_udp_checksums = false; + ub->ifindex = dev->ifindex; + b->mtu = dev->mtu - sizeof(struct iphdr) + - sizeof(struct udphdr); +#if IS_ENABLED(CONFIG_IPV6) + } else if (htons(local.proto) == ETH_P_IPV6) { + udp_conf.family = AF_INET6; + udp_conf.use_udp6_tx_checksums = true; + udp_conf.use_udp6_rx_checksums = true; + udp_conf.local_ip6 = in6addr_any; + b->mtu = 1280; +#endif + } else { + err = -EAFNOSUPPORT; + goto err; + } + udp_conf.local_udp_port = local.udp_port; + err = udp_sock_create(net, &udp_conf, &ub->ubsock); + if (err) + goto err; + tuncfg.sk_user_data = ub; + tuncfg.encap_type = 1; + tuncfg.encap_rcv = tipc_udp_recv; + tuncfg.encap_destroy = NULL; + setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); + + if (enable_mcast(ub, remote)) + goto err; + return 0; +err: + kfree(ub); + return err; +} + +/* cleanup_bearer - break the socket/bearer association */ +static void cleanup_bearer(struct work_struct *work) +{ + struct udp_bearer *ub = container_of(work, struct udp_bearer, work); + + if (ub->ubsock) + udp_tunnel_sock_release(ub->ubsock); + synchronize_net(); + kfree(ub); +} + +/* tipc_udp_disable - detach bearer from socket */ +static void tipc_udp_disable(struct tipc_bearer *b) +{ + struct udp_bearer *ub; + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + pr_err("UDP bearer instance not found\n"); + return; + } + if (ub->ubsock) + sock_set_flag(ub->ubsock->sk, SOCK_DEAD); + RCU_INIT_POINTER(b->media_ptr, NULL); + RCU_INIT_POINTER(ub->bearer, NULL); + + /* sock_release need to be done outside of rtnl lock */ + INIT_WORK(&ub->work, cleanup_bearer); + schedule_work(&ub->work); +} + +struct tipc_media udp_media_info = { + .send_msg = tipc_udp_send_msg, + .enable_media = tipc_udp_enable, + .disable_media = tipc_udp_disable, + .addr2str = tipc_udp_addr2str, + .addr2msg = tipc_udp_addr2msg, + .msg2addr = tipc_udp_msg2addr, + .priority = TIPC_DEF_LINK_PRI, + .tolerance = TIPC_DEF_LINK_TOL, + .window = TIPC_DEF_LINK_WIN, + .type_id = TIPC_MEDIA_TYPE_UDP, + .hwaddr_len = 0, + .name = "udp" +}; -- cgit v1.2.3 From 37ed9493699cc5dafe1b8725858ef73176fdc9d2 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Thu, 5 Mar 2015 21:21:13 -0800 Subject: rtnetlink: add RTNH_F_EXTERNAL flag for fib offload Add new RTNH_F_EXTERNAL flag to mark fib entries offloaded externally, for example to a switchdev switch device. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 06f75a407f74..c3722b024e73 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -334,6 +334,7 @@ struct rtnexthop { #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ +#define RTNH_F_EXTERNAL 8 /* Route installed externally */ /* Macros to handle hexthops */ -- cgit v1.2.3 From 05050753602626ed4c46271c689929b625f409e7 Mon Sep 17 00:00:00 2001 From: Ilan peer Date: Wed, 4 Mar 2015 00:32:06 -0500 Subject: cfg80211: Add API to change the indoor regulatory setting Previously, the indoor setting configuration assumed that as long as a station interface is connected, the indoor environment setting does not change. However, this assumption is problematic as: - It is possible that a station interface is connected to a mobile AP, e.g., softAP or a P2P GO, where it is possible that both the station and the mobile AP move out of the indoor environment making the indoor setting invalid. In such a case, user space has no way to invalidate the setting. - A station interface disconnection does not necessarily imply that the device is no longer operating in an indoor environment, e.g., it is possible that the station interface is roaming but is still stays indoor. To handle the above, extend the indoor configuration API to allow user space to indicate a change of indoor settings, and allow it to indicate weather it controls the indoor setting, such that: 1. If the user space process explicitly indicates that it is going to control the indoor setting, do not clear the indoor setting internally, unless the socket is released. The user space process should use the NL80211_ATTR_SOCKET_OWNER attribute in the command to state that it is going to control the indoor setting. 2. Reset the indoor setting when restoring the regulatory settings in case it is not owned by a user space process. Based on the above, a user space tool that continuously monitors the indoor settings, i.e., tracking power setting, location etc., can indicate environment changes to the regulatory core. It should be noted that currently user space is the only provided mechanism used to hint to the regulatory core over the indoor/outdoor environment -- while the country IEs do have an environment setting this has been completely ignored by the regulatory core by design for a while now since country IEs typically can contain bogus data. Acked-by: Luis R. Rodriguez Signed-off-by: ArikX Nemtsov Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 9 +++++++ net/wireless/nl80211.c | 19 ++++++++++++++- net/wireless/reg.c | 57 ++++++++++++++++++++++++++++++++++++++++---- net/wireless/reg.h | 15 +++++++++++- 4 files changed, 94 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 37e7f39441e5..ae16ba9cb1e3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1697,6 +1697,10 @@ enum nl80211_commands { * If set during scheduled scan start then the new scan req will be * owned by the netlink socket that created it and the scheduled scan will * be stopped when the socket is closed. + * If set during configuration of regulatory indoor operation then the + * regulatory indoor configuration would be owned by the netlink socket + * that configured the indoor setting, and the indoor operation would be + * cleared when the socket is closed. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. @@ -1752,6 +1756,9 @@ enum nl80211_commands { * * @NL80211_ATTR_SCHED_SCAN_DELAY: delay before a scheduled scan (or a * WoWLAN net-detect scan) is started, u32 in seconds. + + * @NL80211_ATTR_REG_INDOOR: flag attribute, if set indicates that the device + * is operating in an indoor environment. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2120,6 +2127,8 @@ enum nl80211_attrs { NL80211_ATTR_SCHED_SCAN_DELAY, + NL80211_ATTR_REG_INDOOR, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 07cef3d7653e..b02085301785 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -399,6 +399,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WIPHY_SELF_MANAGED_REG] = { .type = NLA_FLAG }, [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 }, [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 }, + [NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -4958,7 +4959,10 @@ static int parse_reg_rule(struct nlattr *tb[], static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) { char *data = NULL; + bool is_indoor; enum nl80211_user_reg_hint_type user_reg_hint_type; + u32 owner_nlportid; + /* * You should only get this when cfg80211 hasn't yet initialized @@ -4984,7 +4988,15 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) data = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]); return regulatory_hint_user(data, user_reg_hint_type); case NL80211_USER_REG_HINT_INDOOR: - return regulatory_hint_indoor_user(); + if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + owner_nlportid = info->snd_portid; + is_indoor = !!info->attrs[NL80211_ATTR_REG_INDOOR]; + } else { + owner_nlportid = 0; + is_indoor = true; + } + + return regulatory_hint_indoor(is_indoor, owner_nlportid); default: return -EINVAL; } @@ -12810,6 +12822,11 @@ static int nl80211_netlink_notify(struct notifier_block * nb, rcu_read_unlock(); + /* + * It is possible that the user space process that is controlling the + * indoor setting disappeared, so notify the regulatory core. + */ + regulatory_netlink_notify(notify->portid); return NOTIFY_OK; } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index c24c8bf3c988..4239dd408137 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -128,9 +128,12 @@ static int reg_num_devs_support_basehint; * State variable indicating if the platform on which the devices * are attached is operating in an indoor environment. The state variable * is relevant for all registered devices. - * (protected by RTNL) */ static bool reg_is_indoor; +static spinlock_t reg_indoor_lock; + +/* Used to track the userspace process controlling the indoor setting */ +static u32 reg_is_indoor_portid; static const struct ieee80211_regdomain *get_cfg80211_regdom(void) { @@ -2288,15 +2291,50 @@ int regulatory_hint_user(const char *alpha2, return 0; } -int regulatory_hint_indoor_user(void) +int regulatory_hint_indoor(bool is_indoor, u32 portid) { + spin_lock(®_indoor_lock); + + /* It is possible that more than one user space process is trying to + * configure the indoor setting. To handle such cases, clear the indoor + * setting in case that some process does not think that the device + * is operating in an indoor environment. In addition, if a user space + * process indicates that it is controlling the indoor setting, save its + * portid, i.e., make it the owner. + */ + reg_is_indoor = is_indoor; + if (reg_is_indoor) { + if (!reg_is_indoor_portid) + reg_is_indoor_portid = portid; + } else { + reg_is_indoor_portid = 0; + } + spin_unlock(®_indoor_lock); - reg_is_indoor = true; + if (!is_indoor) + reg_check_channels(); return 0; } +void regulatory_netlink_notify(u32 portid) +{ + spin_lock(®_indoor_lock); + + if (reg_is_indoor_portid != portid) { + spin_unlock(®_indoor_lock); + return; + } + + reg_is_indoor = false; + reg_is_indoor_portid = 0; + + spin_unlock(®_indoor_lock); + + reg_check_channels(); +} + /* Driver hints */ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) { @@ -2464,7 +2502,17 @@ static void restore_regulatory_settings(bool reset_user) ASSERT_RTNL(); - reg_is_indoor = false; + /* + * Clear the indoor setting in case that it is not controlled by user + * space, as otherwise there is no guarantee that the device is still + * operating in an indoor environment. + */ + spin_lock(®_indoor_lock); + if (reg_is_indoor && !reg_is_indoor_portid) { + reg_is_indoor = false; + reg_check_channels(); + } + spin_unlock(®_indoor_lock); reset_regdomains(true, &world_regdom); restore_alpha2(alpha2, reset_user); @@ -3061,6 +3109,7 @@ int __init regulatory_init(void) spin_lock_init(®_requests_lock); spin_lock_init(®_pending_beacons_lock); + spin_lock_init(®_indoor_lock); reg_regdb_size_check(); diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 4b45d6e61d24..a2c4e16459da 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -25,7 +25,20 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy); int regulatory_hint_user(const char *alpha2, enum nl80211_user_reg_hint_type user_reg_hint_type); -int regulatory_hint_indoor_user(void); + +/** + * regulatory_hint_indoor - hint operation in indoor env. or not + * @is_indoor: if true indicates that user space thinks that the + * device is operating in an indoor environment. + * @portid: the netlink port ID on which the hint was given. + */ +int regulatory_hint_indoor(bool is_indoor, u32 portid); + +/** + * regulatory_netlink_notify - notify on released netlink socket + * @portid: the netlink socket port ID + */ +void regulatory_netlink_notify(u32 portid); void wiphy_regulatory_register(struct wiphy *wiphy); void wiphy_regulatory_deregister(struct wiphy *wiphy); -- cgit v1.2.3 From 68c557501b008515cb86c9a36c75f4e82e14a819 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Mon, 9 Jun 2014 10:57:26 -0400 Subject: KVM: s390: Allocate and save/restore vector registers Define and allocate space for both the host and guest views of the vector registers for a given vcpu. The 32 vector registers occupy 128 bits each (512 bytes total), but architecturally are paired with 512 additional bytes of reserved space for future expansion. The kvm_sync_regs structs containing the registers are union'ed with 1024 bytes of padding in the common kvm_run struct. The addition of 1024 bytes of new register information clearly exceeds the existing union, so an expansion of that padding is required. When changing environments, we need to appropriately save and restore the vector registers viewed by both the host and guest, into and out of the sync_regs space. The floating point registers overlay the upper half of vector registers 0-15, so there's a bit of data duplication here that needs to be carefully avoided. Signed-off-by: Eric Farman Reviewed-by: Thomas Huth Acked-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 10 ++++++++++ arch/s390/include/asm/kvm_host.h | 10 +++++++++- arch/s390/include/uapi/asm/kvm.h | 4 ++++ arch/s390/kvm/kvm-s390.c | 39 +++++++++++++++++++++++++++++++++------ include/uapi/linux/kvm.h | 3 ++- 5 files changed, 58 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index b112efc816f1..ee47998ec368 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3248,3 +3248,13 @@ All other orders will be handled completely in user space. Only privileged operation exceptions will be checked for in the kernel (or even in the hardware prior to interception). If this capability is not enabled, the old way of handling SIGP orders is used (partially in kernel and user space). + +7.3 KVM_CAP_S390_VECTOR_REGISTERS + +Architectures: s390 +Parameters: none +Returns: 0 on success, negative value on error + +Allows use of the vector registers introduced with z13 processor, and +provides for the synchronization between host and user space. Will +return -EINVAL if the machine does not support vectors. diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index f407bbf5ee94..3449a388b169 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -183,11 +183,17 @@ struct kvm_s390_itdb { __u8 data[256]; } __packed; +struct kvm_s390_vregs { + __vector128 vrs[32]; + __u8 reserved200[512]; /* for future vector expansion */ +} __packed; + struct sie_page { struct kvm_s390_sie_block sie_block; __u8 reserved200[1024]; /* 0x0200 */ struct kvm_s390_itdb itdb; /* 0x0600 */ - __u8 reserved700[2304]; /* 0x0700 */ + __u8 reserved700[1280]; /* 0x0700 */ + struct kvm_s390_vregs vregs; /* 0x0c00 */ } __packed; struct kvm_vcpu_stat { @@ -465,6 +471,7 @@ struct kvm_vcpu_arch { s390_fp_regs host_fpregs; unsigned int host_acrs[NUM_ACRS]; s390_fp_regs guest_fpregs; + struct kvm_s390_vregs *host_vregs; struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; struct kvm_s390_pgm_info pgm; @@ -551,6 +558,7 @@ struct kvm_arch{ int css_support; int use_irqchip; int use_cmma; + int use_vectors; int user_cpu_state_ctrl; int user_sigp; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 9c77e60b9a26..ef1a5fcc6c66 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -150,6 +150,7 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_CRS (1UL << 3) #define KVM_SYNC_ARCH0 (1UL << 4) #define KVM_SYNC_PFAULT (1UL << 5) +#define KVM_SYNC_VRS (1UL << 6) /* definition of registers in kvm_run */ struct kvm_sync_regs { __u64 prefix; /* prefix register */ @@ -164,6 +165,9 @@ struct kvm_sync_regs { __u64 pft; /* pfault token [PFAULT] */ __u64 pfs; /* pfault select [PFAULT] */ __u64 pfc; /* pfault compare [PFAULT] */ + __u64 vrs[32][2]; /* vector registers */ + __u8 reserved[512]; /* for future vector expansion */ + __u32 fpc; /* only valid with vector registers */ }; #define KVM_REG_S390_TODPR (KVM_REG_S390 | KVM_REG_SIZE_U32 | 0x1) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f3517e716fa4..a8fe3ab76d68 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -185,6 +185,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_COW: r = MACHINE_HAS_ESOP; break; + case KVM_CAP_S390_VECTOR_REGISTERS: + r = MACHINE_HAS_VX; + break; default: r = 0; } @@ -265,6 +268,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) kvm->arch.user_sigp = 1; r = 0; break; + case KVM_CAP_S390_VECTOR_REGISTERS: + kvm->arch.use_vectors = MACHINE_HAS_VX; + r = MACHINE_HAS_VX ? 0 : -EINVAL; + break; default: r = -EINVAL; break; @@ -942,6 +949,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.css_support = 0; kvm->arch.use_irqchip = 0; + kvm->arch.use_vectors = 0; kvm->arch.epoch = 0; spin_lock_init(&kvm->arch.start_stop_lock); @@ -1035,6 +1043,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) KVM_SYNC_CRS | KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT; + if (test_kvm_facility(vcpu->kvm, 129)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; if (kvm_is_ucontrol(vcpu->kvm)) return __kvm_ucontrol_vcpu_init(vcpu); @@ -1045,10 +1055,18 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { save_fp_ctl(&vcpu->arch.host_fpregs.fpc); - save_fp_regs(vcpu->arch.host_fpregs.fprs); + if (vcpu->kvm->arch.use_vectors) + save_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs); + else + save_fp_regs(vcpu->arch.host_fpregs.fprs); save_access_regs(vcpu->arch.host_acrs); - restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc); - restore_fp_regs(vcpu->arch.guest_fpregs.fprs); + if (vcpu->kvm->arch.use_vectors) { + restore_fp_ctl(&vcpu->run->s.regs.fpc); + restore_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs); + } else { + restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc); + restore_fp_regs(vcpu->arch.guest_fpregs.fprs); + } restore_access_regs(vcpu->run->s.regs.acrs); gmap_enable(vcpu->arch.gmap); atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); @@ -1058,11 +1076,19 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); gmap_disable(vcpu->arch.gmap); - save_fp_ctl(&vcpu->arch.guest_fpregs.fpc); - save_fp_regs(vcpu->arch.guest_fpregs.fprs); + if (vcpu->kvm->arch.use_vectors) { + save_fp_ctl(&vcpu->run->s.regs.fpc); + save_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs); + } else { + save_fp_ctl(&vcpu->arch.guest_fpregs.fpc); + save_fp_regs(vcpu->arch.guest_fpregs.fprs); + } save_access_regs(vcpu->run->s.regs.acrs); restore_fp_ctl(&vcpu->arch.host_fpregs.fpc); - restore_fp_regs(vcpu->arch.host_fpregs.fprs); + if (vcpu->kvm->arch.use_vectors) + restore_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs); + else + restore_fp_regs(vcpu->arch.host_fpregs.fprs); restore_access_regs(vcpu->arch.host_acrs); } @@ -1196,6 +1222,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; + vcpu->arch.host_vregs = &sie_page->vregs; vcpu->arch.sie_block->icpua = id; if (!kvm_is_ucontrol(kvm)) { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 805570650062..82634a492fe0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -324,7 +324,7 @@ struct kvm_run { __u64 kvm_dirty_regs; union { struct kvm_sync_regs regs; - char padding[1024]; + char padding[2048]; } s; }; @@ -760,6 +760,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_PPC_ENABLE_HCALL 104 #define KVM_CAP_CHECK_EXTENSION_VM 105 #define KVM_CAP_S390_USER_SIGP 106 +#define KVM_CAP_S390_VECTOR_REGISTERS 107 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From bfd30caebaba7f00c13eec3c538799202d36d12a Mon Sep 17 00:00:00 2001 From: Stefan Brüns Date: Thu, 5 Mar 2015 23:15:07 -0800 Subject: Input: rename KEY_DIRECTION to KEY_ROTATE_DISPLAY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new name better reflects intended usage (but we are keeping the old name as an alias for compatibility). Signed-off-by: Stefan Brüns Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index a1d7e931ab72..eff1cf18031e 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -368,7 +368,8 @@ struct input_keymap_entry { #define KEY_MSDOS 151 #define KEY_COFFEE 152 /* AL Terminal Lock/Screensaver */ #define KEY_SCREENLOCK KEY_COFFEE -#define KEY_DIRECTION 153 +#define KEY_ROTATE_DISPLAY 153 /* Display orientation for e.g. tablets */ +#define KEY_DIRECTION KEY_ROTATE_DISPLAY #define KEY_CYCLEWINDOWS 154 #define KEY_MAIL 155 #define KEY_BOOKMARKS 156 /* AC Bookmarks */ -- cgit v1.2.3 From 647f162b8e7e446c4bade031eb8a1a0a83d3de82 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 1 Mar 2015 10:24:28 -0500 Subject: serial: uapi: Declare all userspace-visible io types ioctl(TIOCGSERIAL|TIOCSSERIAL) report and can change the port->iotype. UART drivers use the UPIO_* definitions, but the uapi header defines parallel values and userspace uses these parallel values for ioctls; thus the userspace values are definitive. Define UPIO_* iotypes in terms of the uapi defines, SERIAL_IO_*; extend the uapi defines to include all values in use by the serial core. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 14 +++++++------- include/uapi/linux/serial.h | 4 ++++ 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 1094f2d9cadb..d10965f0d8a4 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -143,13 +143,13 @@ struct uart_port { unsigned char iotype; /* io access style */ unsigned char unused1; -#define UPIO_PORT (0) /* 8b I/O port access */ -#define UPIO_HUB6 (1) /* Hub6 ISA card */ -#define UPIO_MEM (2) /* 8b MMIO access */ -#define UPIO_MEM32 (3) /* 32b little endian */ -#define UPIO_AU (4) /* Au1x00 and RT288x type IO */ -#define UPIO_TSI (5) /* Tsi108/109 type IO */ -#define UPIO_MEM32BE (6) /* 32b big endian */ +#define UPIO_PORT (SERIAL_IO_PORT) /* 8b I/O port access */ +#define UPIO_HUB6 (SERIAL_IO_HUB6) /* Hub6 ISA card */ +#define UPIO_MEM (SERIAL_IO_MEM) /* 8b MMIO access */ +#define UPIO_MEM32 (SERIAL_IO_MEM32) /* 32b little endian */ +#define UPIO_AU (SERIAL_IO_AU) /* Au1x00 and RT288x type IO */ +#define UPIO_TSI (SERIAL_IO_TSI) /* Tsi108/109 type IO */ +#define UPIO_MEM32BE (SERIAL_IO_MEM32BE) /* 32b big endian */ unsigned int read_status_mask; /* driver specific */ unsigned int ignore_status_mask; /* driver specific */ diff --git a/include/uapi/linux/serial.h b/include/uapi/linux/serial.h index 5e0d0ed61cf3..25331f9faa76 100644 --- a/include/uapi/linux/serial.h +++ b/include/uapi/linux/serial.h @@ -65,6 +65,10 @@ struct serial_struct { #define SERIAL_IO_PORT 0 #define SERIAL_IO_HUB6 1 #define SERIAL_IO_MEM 2 +#define SERIAL_IO_MEM32 3 +#define SERIAL_IO_AU 4 +#define SERIAL_IO_TSI 5 +#define SERIAL_IO_MEM32BE 6 #define UART_CLEAR_FIFO 0x01 #define UART_USE_FIFO 0x02 -- cgit v1.2.3 From c93682477bd861744589215515a63b81fdbd8948 Mon Sep 17 00:00:00 2001 From: Shani Michaeli Date: Thu, 5 Mar 2015 20:16:11 +0200 Subject: net/dcb: Add IEEE QCN attribute As specified in 802.1Qau spec. Add this optional attribute to the DCB netlink layer. To allow for application to use the new attribute, NIC drivers should implement and register the callbacks ieee_getqcn, ieee_setqcn and ieee_getqcnstats. The QCN attribute holds a set of parameters for management, and a set of statistics to provide informative data on Congestion-Control defined by this spec. Signed-off-by: Shani Michaeli Signed-off-by: Shachar Raindel Signed-off-by: Or Gerlitz Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/dcbnl.h | 3 +++ include/uapi/linux/dcbnl.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++ net/dcb/dcbnl.c | 44 ++++++++++++++++++++++++++++--- 3 files changed, 110 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index 597b88a94332..207d9ba1f92c 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -49,6 +49,9 @@ struct dcbnl_rtnl_ops { int (*ieee_setets) (struct net_device *, struct ieee_ets *); int (*ieee_getmaxrate) (struct net_device *, struct ieee_maxrate *); int (*ieee_setmaxrate) (struct net_device *, struct ieee_maxrate *); + int (*ieee_getqcn) (struct net_device *, struct ieee_qcn *); + int (*ieee_setqcn) (struct net_device *, struct ieee_qcn *); + int (*ieee_getqcnstats) (struct net_device *, struct ieee_qcn_stats *); int (*ieee_getpfc) (struct net_device *, struct ieee_pfc *); int (*ieee_setpfc) (struct net_device *, struct ieee_pfc *); int (*ieee_getapp) (struct net_device *, struct dcb_app *); diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index e711f20dc522..6497d7933d5b 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -78,6 +78,70 @@ struct ieee_maxrate { __u64 tc_maxrate[IEEE_8021QAZ_MAX_TCS]; }; +enum dcbnl_cndd_states { + DCB_CNDD_RESET = 0, + DCB_CNDD_EDGE, + DCB_CNDD_INTERIOR, + DCB_CNDD_INTERIOR_READY, +}; + +/* This structure contains the IEEE 802.1Qau QCN managed object. + * + *@rpg_enable: enable QCN RP + *@rppp_max_rps: maximum number of RPs allowed for this CNPV on this port + *@rpg_time_reset: time between rate increases if no CNMs received. + * given in u-seconds + *@rpg_byte_reset: transmitted data between rate increases if no CNMs received. + * given in Bytes + *@rpg_threshold: The number of times rpByteStage or rpTimeStage can count + * before RP rate control state machine advances states + *@rpg_max_rate: the maxinun rate, in Mbits per second, + * at which an RP can transmit + *@rpg_ai_rate: The rate, in Mbits per second, + * used to increase rpTargetRate in the RPR_ACTIVE_INCREASE + *@rpg_hai_rate: The rate, in Mbits per second, + * used to increase rpTargetRate in the RPR_HYPER_INCREASE state + *@rpg_gd: Upon CNM receive, flow rate is limited to (Fb/Gd)*CurrentRate. + * rpgGd is given as log2(Gd), where Gd may only be powers of 2 + *@rpg_min_dec_fac: The minimum factor by which the current transmit rate + * can be changed by reception of a CNM. + * value is given as percentage (1-100) + *@rpg_min_rate: The minimum value, in bits per second, for rate to limit + *@cndd_state_machine: The state of the congestion notification domain + * defense state machine, as defined by IEEE 802.3Qau + * section 32.1.1. In the interior ready state, + * the QCN capable hardware may add CN-TAG TLV to the + * outgoing traffic, to specifically identify outgoing + * flows. + */ + +struct ieee_qcn { + __u8 rpg_enable[IEEE_8021QAZ_MAX_TCS]; + __u32 rppp_max_rps[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_time_reset[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_byte_reset[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_threshold[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_max_rate[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_ai_rate[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_hai_rate[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_gd[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_min_dec_fac[IEEE_8021QAZ_MAX_TCS]; + __u32 rpg_min_rate[IEEE_8021QAZ_MAX_TCS]; + __u32 cndd_state_machine[IEEE_8021QAZ_MAX_TCS]; +}; + +/* This structure contains the IEEE 802.1Qau QCN statistics. + * + *@rppp_rp_centiseconds: the number of RP-centiseconds accumulated + * by RPs at this priority level on this Port + *@rppp_created_rps: number of active RPs(flows) that react to CNMs + */ + +struct ieee_qcn_stats { + __u64 rppp_rp_centiseconds[IEEE_8021QAZ_MAX_TCS]; + __u32 rppp_created_rps[IEEE_8021QAZ_MAX_TCS]; +}; + /* This structure contains the IEEE 802.1Qaz PFC managed object * * @pfc_cap: Indicates the number of traffic classes on the local device @@ -334,6 +398,8 @@ enum ieee_attrs { DCB_ATTR_IEEE_PEER_PFC, DCB_ATTR_IEEE_PEER_APP, DCB_ATTR_IEEE_MAXRATE, + DCB_ATTR_IEEE_QCN, + DCB_ATTR_IEEE_QCN_STATS, __DCB_ATTR_IEEE_MAX }; #define DCB_ATTR_IEEE_MAX (__DCB_ATTR_IEEE_MAX - 1) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 93ea80196f0e..5b21f6f88e97 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -177,6 +177,8 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)}, [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)}, + [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, + [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, }; static const struct nla_policy dcbnl_ieee_app[DCB_ATTR_IEEE_APP_MAX + 1] = { @@ -1030,7 +1032,7 @@ nla_put_failure: return err; } -/* Handle IEEE 802.1Qaz GET commands. */ +/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { struct nlattr *ieee, *app; @@ -1067,6 +1069,32 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) } } + if (ops->ieee_getqcn) { + struct ieee_qcn qcn; + + memset(&qcn, 0, sizeof(qcn)); + err = ops->ieee_getqcn(netdev, &qcn); + if (!err) { + err = nla_put(skb, DCB_ATTR_IEEE_QCN, + sizeof(qcn), &qcn); + if (err) + return -EMSGSIZE; + } + } + + if (ops->ieee_getqcnstats) { + struct ieee_qcn_stats qcn_stats; + + memset(&qcn_stats, 0, sizeof(qcn_stats)); + err = ops->ieee_getqcnstats(netdev, &qcn_stats); + if (!err) { + err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS, + sizeof(qcn_stats), &qcn_stats); + if (err) + return -EMSGSIZE; + } + } + if (ops->ieee_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); @@ -1379,8 +1407,9 @@ int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, } EXPORT_SYMBOL(dcbnl_cee_notify); -/* Handle IEEE 802.1Qaz SET commands. If any requested operation can not - * be completed the entire msg is aborted and error value is returned. +/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands. + * If any requested operation can not be completed + * the entire msg is aborted and error value is returned. * No attempt is made to reconcile the case where only part of the * cmd can be completed. */ @@ -1417,6 +1446,15 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, goto err; } + if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) { + struct ieee_qcn *qcn = + nla_data(ieee[DCB_ATTR_IEEE_QCN]); + + err = ops->ieee_setqcn(netdev, qcn); + if (err) + goto err; + } + if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) { struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]); err = ops->ieee_setpfc(netdev, pfc); -- cgit v1.2.3 From 7e41a9def062167b5405711a42c9ecfd163e31a9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 6 Mar 2015 12:50:03 +1030 Subject: virtio_blk: typo fix Now that QEmu reuses linux virtio headers, we noticed a typo in the exported virtio block header. Fix it up. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- include/uapi/linux/virtio_blk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index 3c53eec4ae22..b695ba959186 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -60,7 +60,7 @@ struct virtio_blk_config { __u32 size_max; /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */ __u32 seg_max; - /* geometry the device (if VIRTIO_BLK_F_GEOMETRY) */ + /* geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */ struct virtio_blk_geometry { __u16 cylinders; __u8 heads; -- cgit v1.2.3 From 0fa2a56437d0b7ef5d86eef2778ad3469ca72d5a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 6 Mar 2015 12:50:03 +1030 Subject: virtio_blk: fix comment for virtio 1.0 Fix up comment to match virtio 1.0 logic: virtio_blk_outhdr isn't the first elements anymore, the only requirement is that it comes first in the s/g list. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- include/uapi/linux/virtio_blk.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index b695ba959186..19c66fcbab8a 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -119,7 +119,11 @@ struct virtio_blk_config { #define VIRTIO_BLK_T_BARRIER 0x80000000 #endif /* !VIRTIO_BLK_NO_LEGACY */ -/* This is the first element of the read scatter-gather list. */ +/* + * This comes first in the read scatter-gather list. + * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, + * this is the first element of the read scatter-gather list. + */ struct virtio_blk_outhdr { /* VIRTIO_BLK_T* */ __virtio32 type; -- cgit v1.2.3 From c78ba6d64c78634a875d1e316676667cabfea256 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Wed, 11 Mar 2015 15:39:21 +0100 Subject: ipv6: expose RFC4191 route preference via rtnetlink This makes it possible to retain the route preference when RAs are handled in userspace. Signed-off-by: Lubomir Rintel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + net/ipv6/route.c | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index c3722b024e73..bea910f924dd 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -305,6 +305,7 @@ enum rtattr_type_t { RTA_MFC_STATS, RTA_VIA, RTA_NEWDST, + RTA_PREF, __RTA_MAX }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 06fa819c43c9..58c0e6a4d15d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2398,6 +2398,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, + [RTA_PREF] = { .type = NLA_U8 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2405,6 +2406,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; + unsigned int pref; int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); @@ -2480,6 +2482,14 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); } + if (tb[RTA_PREF]) { + pref = nla_get_u8(tb[RTA_PREF]); + if (pref != ICMPV6_ROUTER_PREF_LOW && + pref != ICMPV6_ROUTER_PREF_HIGH) + pref = ICMPV6_ROUTER_PREF_MEDIUM; + cfg->fc_flags |= RTF_PREF(pref); + } + err = 0; errout: return err; @@ -2583,7 +2593,8 @@ static inline size_t rt6_nlmsg_size(void) + nla_total_size(4) /* RTA_PRIORITY */ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) - + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ + + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + + nla_total_size(1); /* RTA_PREF */ } static int rt6_fill_node(struct net *net, @@ -2724,6 +2735,9 @@ static int rt6_fill_node(struct net *net, if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto nla_put_failure; + if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From a4994b810d52ccb26de922c8d231fe05d14610d4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 13 Mar 2015 11:59:11 +1030 Subject: uapi/virtio_scsi: allow overriding CDB/SENSE size QEMU wants to use virtio scsi structures with a different VIRTIO_SCSI_CDB_SIZE/VIRTIO_SCSI_SENSE_SIZE, let's add ifdefs to allow overriding them. Keep the old defines under new names: VIRTIO_SCSI_CDB_DEFAULT_SIZE/VIRTIO_SCSI_SENSE_DEFAULT_SIZE, since that's what these values really are: defaults for cdb/sense size fields. Suggested-by: Paolo Bonzini Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- include/uapi/linux/virtio_scsi.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_scsi.h b/include/uapi/linux/virtio_scsi.h index 42b9370771b0..cc18ef8825c0 100644 --- a/include/uapi/linux/virtio_scsi.h +++ b/include/uapi/linux/virtio_scsi.h @@ -29,8 +29,16 @@ #include -#define VIRTIO_SCSI_CDB_SIZE 32 -#define VIRTIO_SCSI_SENSE_SIZE 96 +/* Default values of the CDB and sense data size configuration fields */ +#define VIRTIO_SCSI_CDB_DEFAULT_SIZE 32 +#define VIRTIO_SCSI_SENSE_DEFAULT_SIZE 96 + +#ifndef VIRTIO_SCSI_CDB_SIZE +#define VIRTIO_SCSI_CDB_SIZE VIRTIO_SCSI_CDB_DEFAULT_SIZE +#endif +#ifndef VIRTIO_SCSI_SENSE_SIZE +#define VIRTIO_SCSI_SENSE_SIZE VIRTIO_SCSI_SENSE_DEFAULT_SIZE +#endif /* SCSI command request, followed by data-out */ struct virtio_scsi_cmd_req { -- cgit v1.2.3 From 03e69b508b6f7c51743055c9f61d1dfeadf4b635 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 14 Mar 2015 02:27:16 +0100 Subject: ebpf: add prandom helper for packet sampling This work is similar to commit 4cd3675ebf74 ("filter: added BPF random opcode") and adds a possibility for packet sampling in eBPF. Currently, this is only possible in classic BPF and useful to combine sampling with f.e. packet sockets, possible also with tc. Example function proto-type looks like: u32 (*prandom_u32)(void) = (void *)BPF_FUNC_get_prandom_u32; Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/core.c | 2 ++ kernel/bpf/helpers.c | 12 ++++++++++++ net/core/filter.c | 2 ++ 5 files changed, 19 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80f2e0fc3d02..50bf95e29a96 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -154,4 +154,6 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_get_prandom_u32_proto; + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3fa1af8a58d7..1c2ca2b477c8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 50603aec766a..c1dbbb5d289b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -661,6 +661,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a3c7701a8b5e..95eb59a045ea 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,7 @@ */ #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -87,3 +88,14 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; + +static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return prandom_u32(); +} + +const struct bpf_func_proto bpf_get_prandom_u32_proto = { + .func = bpf_get_prandom_u32, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/net/core/filter.c b/net/core/filter.c index 7a4eb7030dba..4344db39af2e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1139,6 +1139,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; default: return NULL; } -- cgit v1.2.3 From c04167ce2ca0ecaeaafef006cb0d65cf01b68e42 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 14 Mar 2015 02:27:17 +0100 Subject: ebpf: add helper for obtaining current processor id This patch adds the possibility to obtain raw_smp_processor_id() in eBPF. Currently, this is only possible in classic BPF where commit da2033c28226 ("filter: add SKF_AD_RXHASH and SKF_AD_CPU") has added facilities for this. Perhaps most importantly, this would also allow us to track per CPU statistics with eBPF maps, or to implement a poor-man's per CPU data structure through eBPF maps. Example function proto-type looks like: u32 (*smp_processor_id)(void) = (void *)BPF_FUNC_get_smp_processor_id; Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 12 ++++++++++++ net/core/filter.c | 2 ++ 5 files changed, 17 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50bf95e29a96..30bfd331882a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -155,5 +155,6 @@ extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; +extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1c2ca2b477c8..de1f63668daf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -166,6 +166,7 @@ enum bpf_func_id { BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ + BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1dbbb5d289b..4139a0f8b558 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -662,6 +662,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; +const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 95eb59a045ea..bd7f5988ed9c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -12,6 +12,7 @@ #include #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -99,3 +100,14 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +const struct bpf_func_proto bpf_get_smp_processor_id_proto = { + .func = bpf_get_smp_processor_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/net/core/filter.c b/net/core/filter.c index 4344db39af2e..33310eee6134 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1141,6 +1141,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_map_delete_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; default: return NULL; } -- cgit v1.2.3 From 9bac3d6d548e5cc925570b263f35b70a00a00ffd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 13 Mar 2015 11:57:42 -0700 Subject: bpf: allow extended BPF programs access skb fields introduce user accessible mirror of in-kernel 'struct sk_buff': struct __sk_buff { __u32 len; __u32 pkt_type; __u32 mark; __u32 queue_mapping; }; bpf programs can do: int bpf_prog(struct __sk_buff *skb) { __u32 var = skb->pkt_type; which will be compiled to bpf assembler as: dst_reg = *(u32 *)(src_reg + 4) // 4 == offsetof(struct __sk_buff, pkt_type) bpf verifier will check validity of access and will convert it to: dst_reg = *(u8 *)(src_reg + offsetof(struct sk_buff, __pkt_type_offset)) dst_reg &= 7 since skb->pkt_type is a bitfield. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 5 +- include/uapi/linux/bpf.h | 10 ++++ kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 152 ++++++++++++++++++++++++++++++++++++++++++----- net/core/filter.c | 100 +++++++++++++++++++++++++------ 5 files changed, 234 insertions(+), 35 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 30bfd331882a..280a315de8d6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -103,6 +103,9 @@ struct bpf_verifier_ops { * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type); + + u32 (*convert_ctx_access)(int dst_reg, int src_reg, int ctx_off, + struct bpf_insn *insn); }; struct bpf_prog_type_list { @@ -133,7 +136,7 @@ struct bpf_map *bpf_map_get(struct fd f); void bpf_map_put(struct bpf_map *map); /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); #else static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index de1f63668daf..929545a27546 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -170,4 +170,14 @@ enum bpf_func_id { __BPF_FUNC_MAX_ID, }; +/* user accessible mirror of in-kernel sk_buff. + * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 669719ccc9ee..ea75c654af1b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -519,7 +519,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - err = bpf_check(prog, attr); + err = bpf_check(&prog, attr); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e6b522496250..c22ebd36fa4b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1620,11 +1620,10 @@ static int do_check(struct verifier_env *env) return err; } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0) { - verbose("BPF_LDX uses reserved fields\n"); - return -EINVAL; - } + enum bpf_reg_type src_reg_type; + + /* check for reserved fields is already done */ + /* check src operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) @@ -1643,6 +1642,29 @@ static int do_check(struct verifier_env *env) if (err) return err; + src_reg_type = regs[insn->src_reg].type; + + if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) { + /* saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * use reserved 'imm' field to mark this insn + */ + insn->imm = src_reg_type; + + } else if (src_reg_type != insn->imm && + (src_reg_type == PTR_TO_CTX || + insn->imm == PTR_TO_CTX)) { + /* ABuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. + */ + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); @@ -1790,6 +1812,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) int i, j; for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0)) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_map *map; struct fd f; @@ -1881,6 +1910,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } +static void adjust_branches(struct bpf_prog *prog, int pos, int delta) +{ + struct bpf_insn *insn = prog->insnsi; + int insn_cnt = prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) != BPF_JMP || + BPF_OP(insn->code) == BPF_CALL || + BPF_OP(insn->code) == BPF_EXIT) + continue; + + /* adjust offset of jmps if necessary */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos && i + insn->off + 1 < pos) + insn->off -= delta; + } +} + +/* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +static int convert_ctx_accesses(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + u32 cnt; + int i; + + if (!env->prog->aux->ops->convert_ctx_access) + return 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + continue; + + if (insn->imm != PTR_TO_CTX) { + /* clear internal mark */ + insn->imm = 0; + continue; + } + + cnt = env->prog->aux->ops-> + convert_ctx_access(insn->dst_reg, insn->src_reg, + insn->off, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + if (cnt == 1) { + memcpy(insn, insn_buf, sizeof(*insn)); + continue; + } + + /* several new insns need to be inserted. Make room for them */ + insn_cnt += cnt - 1; + new_prog = bpf_prog_realloc(env->prog, + bpf_prog_size(insn_cnt), + GFP_USER); + if (!new_prog) + return -ENOMEM; + + new_prog->len = insn_cnt; + + memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, + sizeof(*insn) * (insn_cnt - i - cnt)); + + /* copy substitute insns in place of load instruction */ + memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); + + /* adjust branches in the whole program */ + adjust_branches(new_prog, i, cnt - 1); + + /* keep walking new program and skip insns we just inserted */ + env->prog = new_prog; + insn = new_prog->insnsi + i + cnt - 1; + i += cnt - 1; + } + + return 0; +} + static void free_states(struct verifier_env *env) { struct verifier_state_list *sl, *sln; @@ -1903,13 +2018,13 @@ static void free_states(struct verifier_env *env) kfree(env->explored_states); } -int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; struct verifier_env *env; int ret = -EINVAL; - if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; /* 'struct verifier_env' can be global, but since it's not small, @@ -1919,7 +2034,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (!env) return -ENOMEM; - env->prog = prog; + env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -1951,7 +2066,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; - env->explored_states = kcalloc(prog->len, + env->explored_states = kcalloc(env->prog->len, sizeof(struct verifier_state_list *), GFP_USER); ret = -ENOMEM; @@ -1968,6 +2083,10 @@ skip_full_check: while (pop_stack(env, NULL) >= 0); free_states(env); + if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -1983,18 +2102,18 @@ skip_full_check: if (ret == 0 && env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ - prog->aux->used_maps = kmalloc_array(env->used_map_cnt, - sizeof(env->used_maps[0]), - GFP_KERNEL); + env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); - if (!prog->aux->used_maps) { + if (!env->prog->aux->used_maps) { ret = -ENOMEM; goto free_log_buf; } - memcpy(prog->aux->used_maps, env->used_maps, + memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); - prog->aux->used_map_cnt = env->used_map_cnt; + env->prog->aux->used_map_cnt = env->used_map_cnt; /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions @@ -2006,11 +2125,12 @@ free_log_buf: if (log_level) vfree(log_buf); free_env: - if (!prog->aux->used_maps) + if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ release_maps(env); + *prog = env->prog; kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; diff --git a/net/core/filter.c b/net/core/filter.c index 33310eee6134..4e9dd0ad0d5b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -150,10 +150,43 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return prandom_u32(); } +static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (skb_field) { + case SKF_AD_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; + + case SKF_AD_PKTTYPE: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); +#endif + break; + + case SKF_AD_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, queue_mapping)); + break; + } + + return insn - insn_buf; +} + static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; + u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: @@ -167,13 +200,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, - PKT_TYPE_OFFSET()); - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); -#ifdef __BIG_ENDIAN_BITFIELD - insn++; - *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5); -#endif + cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: @@ -197,10 +225,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_MARK: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - - *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, mark)); + cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: @@ -211,10 +237,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_QUEUE: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); - - *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, queue_mapping)); + cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: @@ -1151,13 +1175,55 @@ sk_filter_func_proto(enum bpf_func_id func_id) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { - /* skb fields cannot be accessed yet */ - return false; + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* check bounds */ + if (off < 0 || off >= sizeof(struct __sk_buff)) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + /* all __sk_buff fields are __u32 */ + if (size != 4) + return false; + + return true; +} + +static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct __sk_buff, len): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, len)); + break; + + case offsetof(struct __sk_buff, mark): + return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, pkt_type): + return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, queue_mapping): + return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + } + + return insn - insn_buf; } static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { -- cgit v1.2.3 From 9df85aaa43297cb12dc85155695dd1bfdf94f91d Mon Sep 17 00:00:00 2001 From: Antonios Motakis Date: Mon, 16 Mar 2015 14:08:43 -0600 Subject: vfio: platform: probe to devices on the platform bus Driver to bind to Linux platform devices, and callbacks to discover their resources to be used by the main VFIO PLATFORM code. Signed-off-by: Antonios Motakis Signed-off-by: Baptiste Reynal Reviewed-by: Eric Auger Tested-by: Eric Auger Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_platform.c | 103 ++++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 1 + 2 files changed, 104 insertions(+) create mode 100644 drivers/vfio/platform/vfio_platform.c (limited to 'include/uapi') diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c new file mode 100644 index 000000000000..cef645c83996 --- /dev/null +++ b/drivers/vfio/platform/vfio_platform.c @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2013 - Virtual Open Systems + * Author: Antonios Motakis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include + +#include "vfio_platform_private.h" + +#define DRIVER_VERSION "0.10" +#define DRIVER_AUTHOR "Antonios Motakis " +#define DRIVER_DESC "VFIO for platform devices - User Level meta-driver" + +/* probing devices from the linux platform bus */ + +static struct resource *get_platform_resource(struct vfio_platform_device *vdev, + int num) +{ + struct platform_device *dev = (struct platform_device *) vdev->opaque; + int i; + + for (i = 0; i < dev->num_resources; i++) { + struct resource *r = &dev->resource[i]; + + if (resource_type(r) & (IORESOURCE_MEM|IORESOURCE_IO)) { + if (!num) + return r; + + num--; + } + } + return NULL; +} + +static int get_platform_irq(struct vfio_platform_device *vdev, int i) +{ + struct platform_device *pdev = (struct platform_device *) vdev->opaque; + + return platform_get_irq(pdev, i); +} + +static int vfio_platform_probe(struct platform_device *pdev) +{ + struct vfio_platform_device *vdev; + int ret; + + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) + return -ENOMEM; + + vdev->opaque = (void *) pdev; + vdev->name = pdev->name; + vdev->flags = VFIO_DEVICE_FLAGS_PLATFORM; + vdev->get_resource = get_platform_resource; + vdev->get_irq = get_platform_irq; + + ret = vfio_platform_probe_common(vdev, &pdev->dev); + if (ret) + kfree(vdev); + + return ret; +} + +static int vfio_platform_remove(struct platform_device *pdev) +{ + struct vfio_platform_device *vdev; + + vdev = vfio_platform_remove_common(&pdev->dev); + if (vdev) { + kfree(vdev); + return 0; + } + + return -EINVAL; +} + +static struct platform_driver vfio_platform_driver = { + .probe = vfio_platform_probe, + .remove = vfio_platform_remove, + .driver = { + .name = "vfio-platform", + .owner = THIS_MODULE, + }, +}; + +module_platform_driver(vfio_platform_driver); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 82889c30f4f5..ea9514b6bb5c 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -160,6 +160,7 @@ struct vfio_device_info { __u32 flags; #define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */ #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ +#define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; -- cgit v1.2.3 From 36fe431f2811fa3b5fed15d272c585d5a47977aa Mon Sep 17 00:00:00 2001 From: Antonios Motakis Date: Mon, 16 Mar 2015 14:08:44 -0600 Subject: vfio: amba: VFIO support for AMBA devices Add support for discovering AMBA devices with VFIO and handle them similarly to Linux platform devices. Signed-off-by: Antonios Motakis Signed-off-by: Baptiste Reynal Reviewed-by: Eric Auger Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_amba.c | 115 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 1 + 2 files changed, 116 insertions(+) create mode 100644 drivers/vfio/platform/vfio_amba.c (limited to 'include/uapi') diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c new file mode 100644 index 000000000000..ff0331f72526 --- /dev/null +++ b/drivers/vfio/platform/vfio_amba.c @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2013 - Virtual Open Systems + * Author: Antonios Motakis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include + +#include "vfio_platform_private.h" + +#define DRIVER_VERSION "0.10" +#define DRIVER_AUTHOR "Antonios Motakis " +#define DRIVER_DESC "VFIO for AMBA devices - User Level meta-driver" + +/* probing devices from the AMBA bus */ + +static struct resource *get_amba_resource(struct vfio_platform_device *vdev, + int i) +{ + struct amba_device *adev = (struct amba_device *) vdev->opaque; + + if (i == 0) + return &adev->res; + + return NULL; +} + +static int get_amba_irq(struct vfio_platform_device *vdev, int i) +{ + struct amba_device *adev = (struct amba_device *) vdev->opaque; + int ret = 0; + + if (i < AMBA_NR_IRQS) + ret = adev->irq[i]; + + /* zero is an unset IRQ for AMBA devices */ + return ret ? ret : -ENXIO; +} + +static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) +{ + struct vfio_platform_device *vdev; + int ret; + + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) + return -ENOMEM; + + vdev->name = kasprintf(GFP_KERNEL, "vfio-amba-%08x", adev->periphid); + if (!vdev->name) { + kfree(vdev); + return -ENOMEM; + } + + vdev->opaque = (void *) adev; + vdev->flags = VFIO_DEVICE_FLAGS_AMBA; + vdev->get_resource = get_amba_resource; + vdev->get_irq = get_amba_irq; + + ret = vfio_platform_probe_common(vdev, &adev->dev); + if (ret) { + kfree(vdev->name); + kfree(vdev); + } + + return ret; +} + +static int vfio_amba_remove(struct amba_device *adev) +{ + struct vfio_platform_device *vdev; + + vdev = vfio_platform_remove_common(&adev->dev); + if (vdev) { + kfree(vdev->name); + kfree(vdev); + return 0; + } + + return -EINVAL; +} + +static struct amba_id pl330_ids[] = { + { 0, 0 }, +}; + +MODULE_DEVICE_TABLE(amba, pl330_ids); + +static struct amba_driver vfio_amba_driver = { + .probe = vfio_amba_probe, + .remove = vfio_amba_remove, + .id_table = pl330_ids, + .drv = { + .name = "vfio-amba", + .owner = THIS_MODULE, + }, +}; + +module_amba_driver(vfio_amba_driver); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index ea9514b6bb5c..b57b750c222f 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -161,6 +161,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */ #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */ +#define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; -- cgit v1.2.3 From 41408c28f283b49202ae374b1c42bc8e9b33a048 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 6 Feb 2015 15:01:21 +0100 Subject: KVM: s390: Add MEMOP ioctls for reading/writing guest memory On s390, we've got to make sure to hold the IPTE lock while accessing logical memory. So let's add an ioctl for reading and writing logical memory to provide this feature for userspace, too. The maximum transfer size of this call is limited to 64kB to prevent that the guest can trigger huge copy_from/to_user transfers. QEMU currently only requests up to one or two pages so far, so 16*4kB seems to be a reasonable limit here. Signed-off-by: Thomas Huth Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 46 ++++++++++++++++++++++++ arch/s390/kvm/gaccess.c | 22 ++++++++++++ arch/s390/kvm/gaccess.h | 2 ++ arch/s390/kvm/kvm-s390.c | 74 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 21 +++++++++++ 5 files changed, 165 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index ee47998ec368..281179d92a28 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2716,6 +2716,52 @@ The fields in each entry are defined as follows: eax, ebx, ecx, edx: the values returned by the cpuid instruction for this function/index combination +4.89 KVM_S390_MEM_OP + +Capability: KVM_CAP_S390_MEM_OP +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_mem_op (in) +Returns: = 0 on success, + < 0 on generic error (e.g. -EFAULT or -ENOMEM), + > 0 if an exception occurred while walking the page tables + +Read or write data from/to the logical (virtual) memory of a VPCU. + +Parameters are specified via the following structure: + +struct kvm_s390_mem_op { + __u64 gaddr; /* the guest address */ + __u64 flags; /* flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + __u8 ar; /* the access register number */ + __u8 reserved[31]; /* should be set to 0 */ +}; + +The type of operation is specified in the "op" field. It is either +KVM_S390_MEMOP_LOGICAL_READ for reading from logical memory space or +KVM_S390_MEMOP_LOGICAL_WRITE for writing to logical memory space. The +KVM_S390_MEMOP_F_CHECK_ONLY flag can be set in the "flags" field to check +whether the corresponding memory access would create an access exception +(without touching the data in the memory at the destination). In case an +access exception occurred while walking the MMU tables of the guest, the +ioctl returns a positive error number to indicate the type of exception. +This exception is also raised directly at the corresponding VCPU if the +flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field. + +The start address of the memory region has to be specified in the "gaddr" +field, and the length of the region in the "size" field. "buf" is the buffer +supplied by the userspace application where the read data should be written +to for KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written +is stored for a KVM_S390_MEMOP_LOGICAL_WRITE. "buf" is unused and can be NULL +when KVM_S390_MEMOP_F_CHECK_ONLY is specified. "ar" designates the access +register number to be used. + +The "reserved" field is meant for future extensions. It is not used by +KVM with the currently defined set of flags. + 5. The kvm_run structure ------------------------ diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index ea38d716e24d..a7559f7207df 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -863,6 +863,28 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, return rc; } +/** + * check_gva_range - test a range of guest virtual addresses for accessibility + */ +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, + unsigned long length, int is_write) +{ + unsigned long gpa; + unsigned long currlen; + int rc = 0; + + ipte_lock(vcpu); + while (length > 0 && !rc) { + currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE)); + rc = guest_translate_address(vcpu, gva, ar, &gpa, is_write); + gva += currlen; + length -= currlen; + } + ipte_unlock(vcpu); + + return rc; +} + /** * kvm_s390_check_low_addr_prot_real - check for low-address protection * @gra: Guest real address diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 835e557dabf4..ef03726cc661 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -157,6 +157,8 @@ int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data, int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, unsigned long *gpa, int write); +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, + unsigned long length, int is_write); int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, unsigned long len, int write); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 610e90afadf2..b7ecef98b668 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -38,6 +39,8 @@ #include "trace.h" #include "trace-s390.h" +#define MEM_OP_MAX_SIZE 65536 /* Maximum transfer size for KVM_S390_MEM_OP */ + #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU struct kvm_stats_debugfs_item debugfs_entries[] = { @@ -177,6 +180,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_USER_SIGP: r = 1; break; + case KVM_CAP_S390_MEM_OP: + r = MEM_OP_MAX_SIZE; + break; case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; @@ -2185,6 +2191,65 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } +static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + void *tmpbuf = NULL; + int r, srcu_idx; + const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION + | KVM_S390_MEMOP_F_CHECK_ONLY; + + if (mop->flags & ~supported_flags) + return -EINVAL; + + if (mop->size > MEM_OP_MAX_SIZE) + return -E2BIG; + + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { + tmpbuf = vmalloc(mop->size); + if (!tmpbuf) + return -ENOMEM; + } + + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + switch (mop->op) { + case KVM_S390_MEMOP_LOGICAL_READ: + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, false); + break; + } + r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); + if (r == 0) { + if (copy_to_user(uaddr, tmpbuf, mop->size)) + r = -EFAULT; + } + break; + case KVM_S390_MEMOP_LOGICAL_WRITE: + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, true); + break; + } + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + break; + } + r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); + break; + default: + r = -EINVAL; + } + + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + + if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) + kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + + vfree(tmpbuf); + return r; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2284,6 +2349,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + case KVM_S390_MEM_OP: { + struct kvm_s390_mem_op mem_op; + + if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) + r = kvm_s390_guest_mem_op(vcpu, &mem_op); + else + r = -EFAULT; + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 82634a492fe0..0e16f2c9f0de 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -365,6 +365,24 @@ struct kvm_translation { __u8 pad[5]; }; +/* for KVM_S390_MEM_OP */ +struct kvm_s390_mem_op { + /* in */ + __u64 gaddr; /* the guest address */ + __u64 flags; /* flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + __u8 ar; /* the access register number */ + __u8 reserved[31]; /* should be set to 0 */ +}; +/* types for kvm_s390_mem_op->op */ +#define KVM_S390_MEMOP_LOGICAL_READ 0 +#define KVM_S390_MEMOP_LOGICAL_WRITE 1 +/* flags for kvm_s390_mem_op->flags */ +#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) +#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) + /* for KVM_INTERRUPT */ struct kvm_interrupt { /* in */ @@ -761,6 +779,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_CHECK_EXTENSION_VM 105 #define KVM_CAP_S390_USER_SIGP 106 #define KVM_CAP_S390_VECTOR_REGISTERS 107 +#define KVM_CAP_S390_MEM_OP 108 #ifdef KVM_CAP_IRQ_ROUTING @@ -1136,6 +1155,8 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_VCPU_INIT _IOW(KVMIO, 0xae, struct kvm_vcpu_init) #define KVM_ARM_PREFERRED_TARGET _IOR(KVMIO, 0xaf, struct kvm_vcpu_init) #define KVM_GET_REG_LIST _IOWR(KVMIO, 0xb0, struct kvm_reg_list) +/* Available with KVM_CAP_S390_MEM_OP */ +#define KVM_S390_MEM_OP _IOW(KVMIO, 0xb1, struct kvm_s390_mem_op) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -- cgit v1.2.3 From e44fc8c9dab215ac0e398622a05574cffd5f5184 Mon Sep 17 00:00:00 2001 From: Ekaterina Tumanova Date: Fri, 30 Jan 2015 16:55:56 +0100 Subject: KVM: s390: introduce post handlers for STSI The Store System Information (STSI) instruction currently collects all information it relays to the caller in the kernel. Some information, however, is only available in user space. An example of this is the guest name: The kernel always sets "KVMGuest", but user space knows the actual guest name. This patch introduces a new exit, KVM_EXIT_S390_STSI, guarded by a capability that can be enabled by user space if it wants to be able to insert such data. User space will be provided with the target buffer and the requested STSI function code. Reviewed-by: Eric Farman Reviewed-by: Christian Borntraeger Signed-off-by: Ekaterina Tumanova Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 28 ++++++++++++++++++++++++++++ arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/kvm-s390.c | 5 +++++ arch/s390/kvm/priv.c | 17 ++++++++++++++++- include/uapi/linux/kvm.h | 11 +++++++++++ 5 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 281179d92a28..c1fcb7a3c125 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3304,3 +3304,31 @@ Returns: 0 on success, negative value on error Allows use of the vector registers introduced with z13 processor, and provides for the synchronization between host and user space. Will return -EINVAL if the machine does not support vectors. + +7.4 KVM_CAP_S390_USER_STSI + +Architectures: s390 +Parameters: none + +This capability allows post-handlers for the STSI instruction. After +initial handling in the kernel, KVM exits to user space with +KVM_EXIT_S390_STSI to allow user space to insert further data. + +Before exiting to userspace, kvm handlers should fill in s390_stsi field of +vcpu->run: +struct { + __u64 addr; + __u8 ar; + __u8 reserved; + __u8 fc; + __u8 sel1; + __u16 sel2; +} s390_stsi; + +@addr - guest address of STSI SYSIB +@fc - function code +@sel1 - selector 1 +@sel2 - selector 2 +@ar - access register number + +KVM handlers should exit to userspace with rc = -EREMOTE. diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 347a3333d618..2356a8c660b3 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -565,6 +565,7 @@ struct kvm_arch{ int use_vectors; int user_cpu_state_ctrl; int user_sigp; + int user_stsi; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; int ipte_lock_count; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b7ecef98b668..fdfa10662700 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -178,6 +178,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: case KVM_CAP_S390_USER_SIGP: + case KVM_CAP_S390_USER_STSI: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -280,6 +281,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) kvm->arch.use_vectors = MACHINE_HAS_VX; r = MACHINE_HAS_VX ? 0 : -EINVAL; break; + case KVM_CAP_S390_USER_STSI: + kvm->arch.user_stsi = 1; + r = 0; + break; default: r = -EINVAL; break; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index f4fe02e84326..5e4658d20c77 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -496,6 +496,17 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem) ASCEBC(mem->vm[0].cpi, 16); } +static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, ar_t ar, + u8 fc, u8 sel1, u16 sel2) +{ + vcpu->run->exit_reason = KVM_EXIT_S390_STSI; + vcpu->run->s390_stsi.addr = addr; + vcpu->run->s390_stsi.ar = ar; + vcpu->run->s390_stsi.fc = fc; + vcpu->run->s390_stsi.sel1 = sel1; + vcpu->run->s390_stsi.sel2 = sel2; +} + static int handle_stsi(struct kvm_vcpu *vcpu) { int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28; @@ -556,11 +567,15 @@ static int handle_stsi(struct kvm_vcpu *vcpu) rc = kvm_s390_inject_prog_cond(vcpu, rc); goto out; } + if (vcpu->kvm->arch.user_stsi) { + insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2); + rc = -EREMOTE; + } trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); free_page(mem); kvm_s390_set_psw_cc(vcpu, 0); vcpu->run->s.regs.gprs[0] = 0; - return 0; + return rc; out_no_data: kvm_s390_set_psw_cc(vcpu, 3); out: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0e16f2c9f0de..57445ef88097 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -172,6 +172,7 @@ struct kvm_pit_config { #define KVM_EXIT_S390_TSCH 22 #define KVM_EXIT_EPR 23 #define KVM_EXIT_SYSTEM_EVENT 24 +#define KVM_EXIT_S390_STSI 25 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -309,6 +310,15 @@ struct kvm_run { __u32 type; __u64 flags; } system_event; + /* KVM_EXIT_S390_STSI */ + struct { + __u64 addr; + __u8 ar; + __u8 reserved; + __u8 fc; + __u8 sel1; + __u16 sel2; + } s390_stsi; /* Fix the size of the union. */ char padding[256]; }; @@ -780,6 +790,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_SIGP 106 #define KVM_CAP_S390_VECTOR_REGISTERS 107 #define KVM_CAP_S390_MEM_OP 108 +#define KVM_CAP_S390_USER_STSI 109 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 30ee2a984f07b00895e0e01d78859b3aff9307c7 Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Tue, 23 Sep 2014 09:23:01 -0400 Subject: KVM: s390: Create ioctl for Getting/Setting guest storage keys Provide the KVM_S390_GET_SKEYS and KVM_S390_SET_SKEYS ioctl which can be used to get/set guest storage keys. This functionality is needed for live migration of s390 guests that use storage keys. Signed-off-by: Jason J. Herne Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 58 ++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 123 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 14 +++++ 3 files changed, 195 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c1fcb7a3c125..0d7fc66289a0 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2762,6 +2762,64 @@ register number to be used. The "reserved" field is meant for future extensions. It is not used by KVM with the currently defined set of flags. +4.90 KVM_S390_GET_SKEYS + +Capability: KVM_CAP_S390_SKEYS +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_skeys +Returns: 0 on success, KVM_S390_GET_KEYS_NONE if guest is not using storage + keys, negative value on error + +This ioctl is used to get guest storage key values on the s390 +architecture. The ioctl takes parameters via the kvm_s390_skeys struct. + +struct kvm_s390_skeys { + __u64 start_gfn; + __u64 count; + __u64 skeydata_addr; + __u32 flags; + __u32 reserved[9]; +}; + +The start_gfn field is the number of the first guest frame whose storage keys +you want to get. + +The count field is the number of consecutive frames (starting from start_gfn) +whose storage keys to get. The count field must be at least 1 and the maximum +allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range +will cause the ioctl to return -EINVAL. + +The skeydata_addr field is the address to a buffer large enough to hold count +bytes. This buffer will be filled with storage key data by the ioctl. + +4.91 KVM_S390_SET_SKEYS + +Capability: KVM_CAP_S390_SKEYS +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_skeys +Returns: 0 on success, negative value on error + +This ioctl is used to set guest storage key values on the s390 +architecture. The ioctl takes parameters via the kvm_s390_skeys struct. +See section on KVM_S390_GET_SKEYS for struct definition. + +The start_gfn field is the number of the first guest frame whose storage keys +you want to set. + +The count field is the number of consecutive frames (starting from start_gfn) +whose storage keys to get. The count field must be at least 1 and the maximum +allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range +will cause the ioctl to return -EINVAL. + +The skeydata_addr field is the address to a buffer containing count bytes of +storage keys. Each byte in the buffer will be set as the storage key for a +single frame starting at start_gfn for count frames. + +Note: If any architecturally invalid key value is found in the given data then +the ioctl will return -EINVAL. + 5. The kvm_run structure ------------------------ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index fdfa10662700..0dc22baa0a57 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -179,6 +179,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MP_STATE: case KVM_CAP_S390_USER_SIGP: case KVM_CAP_S390_USER_STSI: + case KVM_CAP_S390_SKEYS: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -729,6 +730,108 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } +static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +{ + uint8_t *keys; + uint64_t hva; + unsigned long curkey; + int i, r = 0; + + if (args->flags != 0) + return -EINVAL; + + /* Is this guest using storage keys? */ + if (!mm_use_skey(current->mm)) + return KVM_S390_GET_SKEYS_NONE; + + /* Enforce sane limit on memory allocation */ + if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) + return -EINVAL; + + keys = kmalloc_array(args->count, sizeof(uint8_t), + GFP_KERNEL | __GFP_NOWARN); + if (!keys) + keys = vmalloc(sizeof(uint8_t) * args->count); + if (!keys) + return -ENOMEM; + + for (i = 0; i < args->count; i++) { + hva = gfn_to_hva(kvm, args->start_gfn + i); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + goto out; + } + + curkey = get_guest_storage_key(current->mm, hva); + if (IS_ERR_VALUE(curkey)) { + r = curkey; + goto out; + } + keys[i] = curkey; + } + + r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, + sizeof(uint8_t) * args->count); + if (r) + r = -EFAULT; +out: + kvfree(keys); + return r; +} + +static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +{ + uint8_t *keys; + uint64_t hva; + int i, r = 0; + + if (args->flags != 0) + return -EINVAL; + + /* Enforce sane limit on memory allocation */ + if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) + return -EINVAL; + + keys = kmalloc_array(args->count, sizeof(uint8_t), + GFP_KERNEL | __GFP_NOWARN); + if (!keys) + keys = vmalloc(sizeof(uint8_t) * args->count); + if (!keys) + return -ENOMEM; + + r = copy_from_user(keys, (uint8_t __user *)args->skeydata_addr, + sizeof(uint8_t) * args->count); + if (r) { + r = -EFAULT; + goto out; + } + + /* Enable storage key handling for the guest */ + s390_enable_skey(); + + for (i = 0; i < args->count; i++) { + hva = gfn_to_hva(kvm, args->start_gfn + i); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + goto out; + } + + /* Lowest order bit is reserved */ + if (keys[i] & 0x01) { + r = -EINVAL; + goto out; + } + + r = set_guest_storage_key(current->mm, hva, + (unsigned long)keys[i], 0); + if (r) + goto out; + } +out: + kvfree(keys); + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -788,6 +891,26 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_s390_vm_has_attr(kvm, &attr); break; } + case KVM_S390_GET_SKEYS: { + struct kvm_s390_skeys args; + + r = -EFAULT; + if (copy_from_user(&args, argp, + sizeof(struct kvm_s390_skeys))) + break; + r = kvm_s390_get_skeys(kvm, &args); + break; + } + case KVM_S390_SET_SKEYS: { + struct kvm_s390_skeys args; + + r = -EFAULT; + if (copy_from_user(&args, argp, + sizeof(struct kvm_s390_skeys))) + break; + r = kvm_s390_set_skeys(kvm, &args); + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 57445ef88097..1162ef7a3fa1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -147,6 +147,16 @@ struct kvm_pit_config { #define KVM_PIT_SPEAKER_DUMMY 1 +struct kvm_s390_skeys { + __u64 start_gfn; + __u64 count; + __u64 skeydata_addr; + __u32 flags; + __u32 reserved[9]; +}; +#define KVM_S390_GET_SKEYS_NONE 1 +#define KVM_S390_SKEYS_MAX 1048576 + #define KVM_EXIT_UNKNOWN 0 #define KVM_EXIT_EXCEPTION 1 #define KVM_EXIT_IO 2 @@ -791,6 +801,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_VECTOR_REGISTERS 107 #define KVM_CAP_S390_MEM_OP 108 #define KVM_CAP_S390_USER_STSI 109 +#define KVM_CAP_S390_SKEYS 110 #ifdef KVM_CAP_IRQ_ROUTING @@ -1168,6 +1179,9 @@ struct kvm_s390_ucas_mapping { #define KVM_GET_REG_LIST _IOWR(KVMIO, 0xb0, struct kvm_reg_list) /* Available with KVM_CAP_S390_MEM_OP */ #define KVM_S390_MEM_OP _IOW(KVMIO, 0xb1, struct kvm_s390_mem_op) +/* Available with KVM_CAP_S390_SKEYS */ +#define KVM_S390_GET_SKEYS _IOW(KVMIO, 0xb2, struct kvm_s390_skeys) +#define KVM_S390_SET_SKEYS _IOW(KVMIO, 0xb3, struct kvm_s390_skeys) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -- cgit v1.2.3 From c24973957975403521ca76a776c2dfd12fbe9add Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 16 Mar 2015 18:06:02 -0700 Subject: bpf: allow BPF programs access 'protocol' and 'vlan_tci' fields as a follow on to patch 70006af95515 ("bpf: allow eBPF access skb fields") this patch allows 'protocol' and 'vlan_tci' fields to be accessible from extended BPF programs. The usage of 'protocol', 'vlan_present' and 'vlan_tci' fields is the same as corresponding SKF_AD_PROTOCOL, SKF_AD_VLAN_TAG_PRESENT and SKF_AD_VLAN_TAG accesses in classic BPF. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 ++ net/core/filter.c | 72 +++++++++++++++++++++++++++++++-------------- samples/bpf/test_verifier.c | 9 ++++++ 3 files changed, 62 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 929545a27546..1623047af463 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -178,6 +178,9 @@ struct __sk_buff { __u32 pkt_type; __u32 mark; __u32 queue_mapping; + __u32 protocol; + __u32 vlan_present; + __u32 vlan_tci; }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index 4e9dd0ad0d5b..b95ae7fe7e4f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -177,6 +177,35 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, queue_mapping)); break; + + case SKF_AD_PROTOCOL: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + + /* dst_reg = *(u16 *) (src_reg + offsetof(protocol)) */ + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, protocol)); + /* dst_reg = ntohs(dst_reg) [emitting a nop or swap16] */ + *insn++ = BPF_ENDIAN(BPF_FROM_BE, dst_reg, 16); + break; + + case SKF_AD_VLAN_TAG: + case SKF_AD_VLAN_TAG_PRESENT: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + + /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_tci)); + if (skb_field == SKF_AD_VLAN_TAG) { + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, + ~VLAN_TAG_PRESENT); + } else { + /* dst_reg >>= 12 */ + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); + /* dst_reg &= 1 */ + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); + } + break; } return insn - insn_buf; @@ -190,13 +219,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - - /* A = *(u16 *) (CTX + offsetof(protocol)) */ - *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, protocol)); - /* A = ntohs(A) [emitting a nop or swap16] */ - *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); + cnt = convert_skb_access(SKF_AD_PROTOCOL, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_PKTTYPE: @@ -242,22 +266,15 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: - case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + cnt = convert_skb_access(SKF_AD_VLAN_TAG, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; - /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */ - *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, vlan_tci)); - if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, - ~VLAN_TAG_PRESENT); - } else { - /* A >>= 12 */ - *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12); - /* A &= 1 */ - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1); - } + case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: + cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_PAY_OFFSET: @@ -1215,6 +1232,17 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, case offsetof(struct __sk_buff, queue_mapping): return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, protocol): + return convert_skb_access(SKF_AD_PROTOCOL, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, vlan_present): + return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, + dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, vlan_tci): + return convert_skb_access(SKF_AD_VLAN_TAG, + dst_reg, src_reg, insn); } return insn - insn_buf; diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index df6dbb6576f6..75d561f9fd6a 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -658,6 +658,15 @@ static struct bpf_test tests[] = { BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, queue_mapping)), BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, protocol)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, vlan_present)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, vlan_tci)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), BPF_EXIT_INSN(), }, .result = ACCEPT, -- cgit v1.2.3 From 27cd44618b92fc8c6889e4628407791e45422bac Mon Sep 17 00:00:00 2001 From: Neil Roberts Date: Wed, 4 Mar 2015 14:41:16 +0000 Subject: drm/i915: Add I915_PARAM_REVISION Adds a parameter which can be used with DRM_I915_GETPARAM to query the GPU revision. The intention is to use this in Mesa to implement the WaDisableSIMD16On3SrcInstr workaround on Skylake but only for revision 2. Signed-off-by: Neil Roberts Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_dma.c | 3 +++ include/uapi/drm/i915_drm.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 053e1788f578..3b64c526b6b9 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -68,6 +68,9 @@ static int i915_getparam(struct drm_device *dev, void *data, case I915_PARAM_CHIPSET_ID: value = dev->pdev->device; break; + case I915_PARAM_REVISION: + value = dev->pdev->revision; + break; case I915_PARAM_HAS_GEM: value = 1; break; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 6eed16b92a24..b768f3b21eaa 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -347,6 +347,7 @@ typedef struct drm_i915_irq_wait { #define I915_PARAM_HAS_COHERENT_PHYS_GTT 29 #define I915_PARAM_MMAP_VERSION 30 #define I915_PARAM_HAS_BSD2 31 +#define I915_PARAM_REVISION 32 typedef struct drm_i915_getparam { int param; -- cgit v1.2.3 From 8c4f83fb1e8bf317e894f62d17a63c32b7a6b75e Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 11 Mar 2015 08:26:23 +0100 Subject: drm/fourcc: 64 #defines need ULL postfix I have no idea about the exact rules, but this angered Dave's 32bit rhel gcc. Reported-by: Dave Airlie Signed-off-by: Daniel Vetter --- include/uapi/drm/drm_fourcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index e6efac23c7ea..07735822a28f 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -151,7 +151,7 @@ /* add more to the end as needed */ #define fourcc_mod_code(vendor, val) \ - ((((u64)DRM_FORMAT_MOD_VENDOR_## vendor) << 56) | (val & 0x00ffffffffffffffL)) + ((((u64)DRM_FORMAT_MOD_VENDOR_## vendor) << 56) | (val & 0x00ffffffffffffffULL)) /* * Format Modifier tokens: -- cgit v1.2.3 From a1559ffefb2a80eabbee65d7cc04e828d4fd557d Mon Sep 17 00:00:00 2001 From: Jeff McGee Date: Mon, 9 Mar 2015 16:06:54 -0700 Subject: drm/i915: Export total subslice and EU counts Setup new I915_GETPARAM ioctl entries for subslice total and EU total. Userspace drivers need these values when constructing GPGPU commands. This kernel query method is intended to replace the PCI ID-based tables that userspace drivers currently maintain. The kernel driver can employ fuse register reads as needed to ensure the most accurate determination of GT config attributes. This first became important with Cherryview in which the config could differ between devices with the same PCI ID. The kernel detection of these values is device-specific and not included in this patch. Because zero is not a valid value for any of these parameters, a value of zero is interpreted as unknown for the device. Userspace drivers should continue to maintain ID-based tables for older devices not supported by the new query method. v2: Increment our I915_GETPARAM indices to fit after REVISION which was merged ahead of us. For: VIZ-4636 Signed-off-by: Jeff McGee Tested-by: Zhigang Gong Acked-by: Zhigang Gong Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_dma.c | 10 ++++++++++ include/uapi/drm/i915_drm.h | 2 ++ 2 files changed, 12 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 8e914303b831..d49ed68f041e 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -153,6 +153,16 @@ static int i915_getparam(struct drm_device *dev, void *data, case I915_PARAM_MMAP_VERSION: value = 1; break; + case I915_PARAM_SUBSLICE_TOTAL: + value = INTEL_INFO(dev)->subslice_total; + if (!value) + return -ENODEV; + break; + case I915_PARAM_EU_TOTAL: + value = INTEL_INFO(dev)->eu_total; + if (!value) + return -ENODEV; + break; default: DRM_DEBUG("Unknown parameter %d\n", param->param); return -EINVAL; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index b768f3b21eaa..8d1be9073380 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -348,6 +348,8 @@ typedef struct drm_i915_irq_wait { #define I915_PARAM_MMAP_VERSION 30 #define I915_PARAM_HAS_BSD2 31 #define I915_PARAM_REVISION 32 +#define I915_PARAM_SUBSLICE_TOTAL 33 +#define I915_PARAM_EU_TOTAL 34 typedef struct drm_i915_getparam { int param; -- cgit v1.2.3 From 847aac644e92e5624f2c153bab409bf713d5ff9a Mon Sep 17 00:00:00 2001 From: Li Xi Date: Thu, 19 Mar 2015 04:04:53 +0900 Subject: vfs: Add general support to enforce project quota limits This patch adds support for a new quota type PRJQUOTA for project quota enforcement. Also a new method get_projid() is added into dquot_operations structure. Signed-off-by: Li Xi Signed-off-by: Dmitry Monakhov Reviewed-by: Jan Kara Signed-off-by: Jan Kara --- fs/quota/dquot.c | 20 +++++++++++++++++--- fs/quota/quotaio_v2.h | 6 ++++-- include/linux/quota.h | 2 ++ include/uapi/linux/quota.h | 6 ++++-- 4 files changed, 27 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 327a58448592..ecc25cf0ee6e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1163,8 +1163,8 @@ static int need_print_warning(struct dquot_warn *warn) return uid_eq(current_fsuid(), warn->w_dq_id.uid); case GRPQUOTA: return in_group_p(warn->w_dq_id.gid); - case PRJQUOTA: /* Never taken... Just make gcc happy */ - return 0; + case PRJQUOTA: + return 1; } return 0; } @@ -1405,6 +1405,9 @@ static void __dquot_initialize(struct inode *inode, int type) /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { struct kqid qid; + kprojid_t projid; + int rc; + got[cnt] = NULL; if (type != -1 && cnt != type) continue; @@ -1415,6 +1418,10 @@ static void __dquot_initialize(struct inode *inode, int type) */ if (dquots[cnt]) continue; + + if (!sb_has_quota_active(sb, cnt)) + continue; + init_needed = 1; switch (cnt) { @@ -1424,6 +1431,12 @@ static void __dquot_initialize(struct inode *inode, int type) case GRPQUOTA: qid = make_kqid_gid(inode->i_gid); break; + case PRJQUOTA: + rc = inode->i_sb->dq_op->get_projid(inode, &projid); + if (rc) + continue; + qid = make_kqid_projid(projid); + break; } got[cnt] = dqget(sb, qid); } @@ -2176,7 +2189,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, error = -EROFS; goto out_fmt; } - if (!sb->s_op->quota_write || !sb->s_op->quota_read) { + if (!sb->s_op->quota_write || !sb->s_op->quota_read || + (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; goto out_fmt; } diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h index f1966b42c2fd..4e95430093d9 100644 --- a/fs/quota/quotaio_v2.h +++ b/fs/quota/quotaio_v2.h @@ -13,12 +13,14 @@ */ #define V2_INITQMAGICS {\ 0xd9c01f11, /* USRQUOTA */\ - 0xd9c01927 /* GRPQUOTA */\ + 0xd9c01927, /* GRPQUOTA */\ + 0xd9c03f14, /* PRJQUOTA */\ } #define V2_INITQVERSIONS {\ 1, /* USRQUOTA */\ - 1 /* GRPQUOTA */\ + 1, /* GRPQUOTA */\ + 1, /* PRJQUOTA */\ } /* First generic header */ diff --git a/include/linux/quota.h b/include/linux/quota.h index cf910d1f8efa..b2505acfd3c0 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -50,6 +50,7 @@ #undef USRQUOTA #undef GRPQUOTA +#undef PRJQUOTA enum quota_type { USRQUOTA = 0, /* element used for user quotas */ GRPQUOTA = 1, /* element used for group quotas */ @@ -319,6 +320,7 @@ struct dquot_operations { /* get reserved quota for delayed alloc, value returned is managed by * quota code only */ qsize_t *(*get_reserved_space) (struct inode *); + int (*get_projid) (struct inode *, kprojid_t *);/* Get project ID */ }; struct path; diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h index 1f49b8341c99..9c95b2c1c88a 100644 --- a/include/uapi/linux/quota.h +++ b/include/uapi/linux/quota.h @@ -36,11 +36,12 @@ #include #include -#define __DQUOT_VERSION__ "dquot_6.5.2" +#define __DQUOT_VERSION__ "dquot_6.6.0" -#define MAXQUOTAS 2 +#define MAXQUOTAS 3 #define USRQUOTA 0 /* element used for user quotas */ #define GRPQUOTA 1 /* element used for group quotas */ +#define PRJQUOTA 2 /* element used for project quotas */ /* * Definitions for the default names of the quotas files. @@ -48,6 +49,7 @@ #define INITQFNAMES { \ "user", /* USRQUOTA */ \ "group", /* GRPQUOTA */ \ + "project", /* PRJQUOTA */ \ "undefined", \ }; -- cgit v1.2.3 From db24a9044ee191c397dcd1c6574f56d67d7c8df5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Mar 2015 20:23:15 -0600 Subject: net: add support for phys_port_name Similar to port id allow netdevices to specify port names and export the name via sysfs. Drivers can implement the netdevice operation to assist udev in having sane default names for the devices using the rule: $ cat /etc/udev/rules.d/80-net-setup-link.rules SUBSYSTEM=="net", ACTION=="add", ATTR{phys_port_name}!="", NAME="$attr{phys_port_name}" Use of phys_name versus phys_id was suggested-by Jiri Pirko. Signed-off-by: David Ahern Acked-by: Jiri Pirko Acked-by: Scott Feldman Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net | 8 ++++++++ include/linux/netdevice.h | 4 ++++ include/uapi/linux/if_link.h | 1 + net/core/dev.c | 18 ++++++++++++++++++ net/core/net-sysfs.c | 23 +++++++++++++++++++++++ net/core/rtnetlink.c | 21 +++++++++++++++++++++ 6 files changed, 75 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index beb8ec4dabbc..5ecfd72ba684 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -188,6 +188,14 @@ Description: Indicates the interface unique physical port identifier within the NIC, as a string. +What: /sys/class/net//phys_port_name +Date: March 2015 +KernelVersion: 4.0 +Contact: netdev@vger.kernel.org +Description: + Indicates the interface physical port name within the NIC, + as a string. + What: /sys/class/net//speed Date: October 2009 KernelVersion: 2.6.33 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 76c5de4978a8..ec8f9b5f6500 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1164,6 +1164,8 @@ struct net_device_ops { bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); + int (*ndo_get_phys_port_name)(struct net_device *dev, + char *name, size_t len); void (*ndo_add_vxlan_port)(struct net_device *dev, sa_family_t sa_family, __be16 port); @@ -2947,6 +2949,8 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 756436e1ce89..7158fd00a109 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -147,6 +147,7 @@ enum { IFLA_CARRIER_CHANGES, IFLA_PHYS_SWITCH_ID, IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, __IFLA_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 39fe369b46ad..a1f24151db5b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5911,6 +5911,24 @@ int dev_get_phys_port_id(struct net_device *dev, } EXPORT_SYMBOL(dev_get_phys_port_id); +/** + * dev_get_phys_port_name - Get device physical port name + * @dev: device + * @name: port name + * + * Get device physical port name + */ +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_phys_port_name) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_name(dev, name, len); +} +EXPORT_SYMBOL(dev_get_phys_port_name); + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7e58bd7ec232..cc5cf689809c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -418,6 +418,28 @@ static ssize_t phys_port_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_port_id); +static ssize_t phys_port_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + char name[IFNAMSIZ]; + + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sprintf(buf, "%s\n", name); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_port_name); + static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -465,6 +487,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, NULL, }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 25b4b5d23485..6abe634c666c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -982,6 +982,24 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) +{ + char name[IFNAMSIZ]; + int err; + + err = dev_get_phys_port_name(dev, name, sizeof(name)); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; @@ -1072,6 +1090,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; + if (rtnl_phys_port_name_fill(skb, dev)) + goto nla_put_failure; + if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; -- cgit v1.2.3 From af615762e972be0c66cf1d156ca4fac13b93c0b0 Mon Sep 17 00:00:00 2001 From: Jörg Thalheim Date: Wed, 18 Mar 2015 10:06:58 +0100 Subject: bridge: add ageing_time, stp_state, priority over netlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jörg Thalheim Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 3 +++ net/bridge/br_netlink.c | 32 +++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7158fd00a109..f5f5edd5ae5f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -225,6 +225,9 @@ enum { IFLA_BR_FORWARD_DELAY, IFLA_BR_HELLO_TIME, IFLA_BR_MAX_AGE, + IFLA_BR_AGEING_TIME, + IFLA_BR_STP_STATE, + IFLA_BR_PRIORITY, __IFLA_BR_MAX, }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 8bc6b67457dc..e1115a224a95 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -733,6 +733,9 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 }, [IFLA_BR_HELLO_TIME] = { .type = NLA_U32 }, [IFLA_BR_MAX_AGE] = { .type = NLA_U32 }, + [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 }, + [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, + [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -762,6 +765,24 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return err; } + if (data[IFLA_BR_AGEING_TIME]) { + u32 ageing_time = nla_get_u32(data[IFLA_BR_AGEING_TIME]); + + br->ageing_time = clock_t_to_jiffies(ageing_time); + } + + if (data[IFLA_BR_STP_STATE]) { + u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); + + br_stp_set_enabled(br, stp_enabled); + } + + if (data[IFLA_BR_PRIORITY]) { + u32 priority = nla_get_u16(data[IFLA_BR_PRIORITY]); + + br_stp_set_bridge_priority(br, priority); + } + return 0; } @@ -770,6 +791,9 @@ static size_t br_get_size(const struct net_device *brdev) return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ + nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ 0; } @@ -779,10 +803,16 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 forward_delay = jiffies_to_clock_t(br->forward_delay); u32 hello_time = jiffies_to_clock_t(br->hello_time); u32 age_time = jiffies_to_clock_t(br->max_age); + u32 ageing_time = jiffies_to_clock_t(br->ageing_time); + u32 stp_enabled = br->stp_enabled; + u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || - nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time)) + nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || + nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || + nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || + nla_put_u16(skb, IFLA_BR_PRIORITY, priority)) return -EMSGSIZE; return 0; -- cgit v1.2.3 From d6d2a1882a79c1a5425d6f82b2fc7b934916f893 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 30 Sep 2014 10:04:40 -0400 Subject: drm/radeon: add INFO query for GPU temperature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Useful for profiling. Tested-by: Marek Olšák Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_kms.c | 7 +++++++ include/uapi/drm/radeon_drm.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 686411e4e4f6..7904887c6cc7 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -547,6 +547,13 @@ static int radeon_info_ioctl(struct drm_device *dev, void *data, struct drm_file else *value = 1; break; + case RADEON_INFO_CURRENT_GPU_TEMP: + /* get temperature in millidegrees C */ + if (rdev->asic->pm.get_temperature) + *value = radeon_get_temperature(rdev); + else + *value = 0; + break; default: DRM_DEBUG_KMS("Invalid request %d\n", info->request); return -EINVAL; diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 50d0fb41a3bf..66b1131bb296 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -1034,6 +1034,7 @@ struct drm_radeon_cs { #define RADEON_INFO_VRAM_USAGE 0x1e #define RADEON_INFO_GTT_USAGE 0x1f #define RADEON_INFO_ACTIVE_CU_COUNT 0x20 +#define RADEON_INFO_CURRENT_GPU_TEMP 0x21 struct drm_radeon_info { uint32_t request; -- cgit v1.2.3 From 5c363a860398e2a09e5cff7f4654cf82ed8485e1 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 30 Sep 2014 11:33:30 -0400 Subject: drm/radeon: add INFO query for current sclk/mclk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow the UMDs to query the current sclk/mclk for profiling, etc. Tested-by: Marek Olšák Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_kms.c | 14 ++++++++++++++ include/uapi/drm/radeon_drm.h | 2 ++ 2 files changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 7904887c6cc7..cf7e54e9b0d1 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -554,6 +554,20 @@ static int radeon_info_ioctl(struct drm_device *dev, void *data, struct drm_file else *value = 0; break; + case RADEON_INFO_CURRENT_GPU_SCLK: + /* get sclk in Mhz */ + if (rdev->pm.dpm_enabled) + *value = radeon_dpm_get_current_sclk(rdev) / 100; + else + *value = rdev->pm.current_sclk / 100; + break; + case RADEON_INFO_CURRENT_GPU_MCLK: + /* get mclk in Mhz */ + if (rdev->pm.dpm_enabled) + *value = radeon_dpm_get_current_mclk(rdev) / 100; + else + *value = rdev->pm.current_mclk / 100; + break; default: DRM_DEBUG_KMS("Invalid request %d\n", info->request); return -EINVAL; diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 66b1131bb296..de7ee21efea2 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -1035,6 +1035,8 @@ struct drm_radeon_cs { #define RADEON_INFO_GTT_USAGE 0x1f #define RADEON_INFO_ACTIVE_CU_COUNT 0x20 #define RADEON_INFO_CURRENT_GPU_TEMP 0x21 +#define RADEON_INFO_CURRENT_GPU_SCLK 0x22 +#define RADEON_INFO_CURRENT_GPU_MCLK 0x23 struct drm_radeon_info { uint32_t request; -- cgit v1.2.3 From 4535cb9cefbc736b47bc1e7f8628065aba5ca0d7 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 1 Oct 2014 11:26:50 -0400 Subject: drm/radeon: add support for read reg query from radeon info ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows us to query certain registers from userspace for profiling and harvest configuration. E.g., it can be used by the GALLIUM_HUD for profiling the status of various gfx blocks. Tested-by: Marek Olšák Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_kms.c | 8 ++++++++ include/uapi/drm/radeon_drm.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index cf7e54e9b0d1..7b2a7335cc5d 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -568,6 +568,14 @@ static int radeon_info_ioctl(struct drm_device *dev, void *data, struct drm_file else *value = rdev->pm.current_mclk / 100; break; + case RADEON_INFO_READ_REG: + if (copy_from_user(value, value_ptr, sizeof(uint32_t))) { + DRM_ERROR("copy_from_user %s:%u\n", __func__, __LINE__); + return -EFAULT; + } + if (radeon_get_allowed_info_register(rdev, *value, value)) + return -EINVAL; + break; default: DRM_DEBUG_KMS("Invalid request %d\n", info->request); return -EINVAL; diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index de7ee21efea2..871e73f99a4d 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -1037,6 +1037,7 @@ struct drm_radeon_cs { #define RADEON_INFO_CURRENT_GPU_TEMP 0x21 #define RADEON_INFO_CURRENT_GPU_SCLK 0x22 #define RADEON_INFO_CURRENT_GPU_MCLK 0x23 +#define RADEON_INFO_READ_REG 0x24 struct drm_radeon_info { uint32_t request; -- cgit v1.2.3 From a736775db683174269c65c7c5cc8e5ee534e7681 Mon Sep 17 00:00:00 2001 From: Charlie Mooney Date: Fri, 20 Mar 2015 09:40:17 -0700 Subject: Input: add MT_TOOL_PALM Currently there are only two "tools" that can be specified by a multi-touch driver: MT_TOOL_FINGER and MT_TOOL_PEN. In working with Elan (The touch vendor) and discussing their next-gen devices it seems that it will be useful to have more tools so that their devices can give the upper layers of the stack hints as to what is touching the sensor. In particular they have new experimental firmware that can better differentiate between palms vs fingertips and would like to plumb a patch so that we can use their hints in higher-level gesture soft- ware. The firmware on the device can reasonably do a better job of palm detection because it has access to all of the raw sensor readings as opposed to just the width/pressure/etc that are exposed by the driver. As such, the firmware can characterize what a palm looks like in much finer-grained detail and this change would allow such a device to share its findings with the kernel. Signed-off-by: Charlie Mooney Acked-by: Peter Hutterer Signed-off-by: Dmitry Torokhov --- Documentation/input/multi-touch-protocol.txt | 9 ++++++--- include/uapi/linux/input.h | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/input/multi-touch-protocol.txt b/Documentation/input/multi-touch-protocol.txt index 7b4f59c09ee2..b85d000faeb4 100644 --- a/Documentation/input/multi-touch-protocol.txt +++ b/Documentation/input/multi-touch-protocol.txt @@ -312,9 +312,12 @@ ABS_MT_TOOL_TYPE The type of approaching tool. A lot of kernel drivers cannot distinguish between different tool types, such as a finger or a pen. In such cases, the -event should be omitted. The protocol currently supports MT_TOOL_FINGER and -MT_TOOL_PEN [2]. For type B devices, this event is handled by input core; -drivers should instead use input_mt_report_slot_state(). +event should be omitted. The protocol currently supports MT_TOOL_FINGER, +MT_TOOL_PEN, and MT_TOOL_PALM [2]. For type B devices, this event is handled +by input core; drivers should instead use input_mt_report_slot_state(). +A contact's ABS_MT_TOOL_TYPE may change over time while still touching the +device, because the firmware may not be able to determine which tool is being +used when it first appears. ABS_MT_BLOB_ID diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index a1d7e931ab72..2320b0ce7579 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -972,7 +972,8 @@ struct input_keymap_entry { */ #define MT_TOOL_FINGER 0 #define MT_TOOL_PEN 1 -#define MT_TOOL_MAX 1 +#define MT_TOOL_PALM 2 +#define MT_TOOL_MAX 2 /* * Values describing the status of a force-feedback effect -- cgit v1.2.3 From 94caee8c312d96522bcdae88791aaa9ebcd5f22c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 20 Mar 2015 15:11:11 +0100 Subject: ebpf: add sched_act_type and map it to sk_filter's verifier ops In order to prepare eBPF support for tc action, we need to add sched_act_type, so that the eBPF verifier is aware of what helper function act_bpf may use, that it can load skb data and read out currently available skb fields. This is bascially analogous to 96be4325f443 ("ebpf: add sched_cls_type and map it to sk_filter's verifier ops"). BPF_PROG_TYPE_SCHED_CLS and BPF_PROG_TYPE_SCHED_ACT need to be separate since both will have a different set of functionality in future (classifier vs action), thus we won't run into ABI troubles when the point in time comes to diverge functionality from the classifier. The future plan for act_bpf would be that it will be able to write into skb->data and alter selected fields mirrored in struct __sk_buff. For an initial support, it's sufficient to map it to sk_filter_ops. Signed-off-by: Daniel Borkmann Cc: Jiri Pirko Reviewed-by: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 1 + net/core/filter.c | 6 ++++++ 3 files changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1623047af463..3dd314a45d0d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, BPF_PROG_TYPE_SCHED_CLS, + BPF_PROG_TYPE_SCHED_ACT, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c22ebd36fa4b..0e714f799ec0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1180,6 +1180,7 @@ static bool may_access_skb(enum bpf_prog_type type) switch (type) { case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: return true; default: return false; diff --git a/net/core/filter.c b/net/core/filter.c index bdaac5895def..084eacc4d1d4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1263,10 +1263,16 @@ static struct bpf_prog_type_list sched_cls_type __read_mostly = { .type = BPF_PROG_TYPE_SCHED_CLS, }; +static struct bpf_prog_type_list sched_act_type __read_mostly = { + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SCHED_ACT, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); + bpf_register_prog_type(&sched_act_type); return 0; } -- cgit v1.2.3 From a8cb5f556b567974d75ea29c15181c445c541b1f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 20 Mar 2015 15:11:12 +0100 Subject: act_bpf: add initial eBPF support for actions This work extends the "classic" BPF programmable tc action by extending its scope also to native eBPF code! Together with commit e2e9b6541dd4 ("cls_bpf: add initial eBPF support for programmable classifiers") this adds the facility to implement fully flexible classifier and actions for tc that can be implemented in a C subset in user space, "safely" loaded into the kernel, and being run in native speed when JITed. Also, since eBPF maps can be shared between eBPF programs, it offers the possibility that cls_bpf and act_bpf can share data 1) between themselves and 2) between user space applications. That means that, f.e. customized runtime statistics can be collected in user space, but also more importantly classifier and action behaviour could be altered based on map input from the user space application. For the remaining details on the workflow and integration, see the cls_bpf commit e2e9b6541dd4. Preliminary iproute2 part can be found under [1]. [1] http://git.breakpoint.cc/cgit/dborkman/iproute2.git/log/?h=ebpf-act Signed-off-by: Daniel Borkmann Cc: Jamal Hadi Salim Cc: Jiri Pirko Acked-by: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/tc_act/tc_bpf.h | 6 +- include/uapi/linux/tc_act/tc_bpf.h | 2 + net/sched/act_bpf.c | 295 ++++++++++++++++++++++++++----------- 3 files changed, 220 insertions(+), 83 deletions(-) (limited to 'include/uapi') diff --git a/include/net/tc_act/tc_bpf.h b/include/net/tc_act/tc_bpf.h index 86a070ffc930..a152e9858b2c 100644 --- a/include/net/tc_act/tc_bpf.h +++ b/include/net/tc_act/tc_bpf.h @@ -16,8 +16,12 @@ struct tcf_bpf { struct tcf_common common; struct bpf_prog *filter; + union { + u32 bpf_fd; + u16 bpf_num_ops; + }; struct sock_filter *bpf_ops; - u16 bpf_num_ops; + const char *bpf_name; }; #define to_bpf(a) \ container_of(a->priv, struct tcf_bpf, common) diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h index 5288bd77e63b..07f17cc70bb3 100644 --- a/include/uapi/linux/tc_act/tc_bpf.h +++ b/include/uapi/linux/tc_act/tc_bpf.h @@ -24,6 +24,8 @@ enum { TCA_ACT_BPF_PARMS, TCA_ACT_BPF_OPS_LEN, TCA_ACT_BPF_OPS, + TCA_ACT_BPF_FD, + TCA_ACT_BPF_NAME, __TCA_ACT_BPF_MAX, }; #define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 5f6288fa3f12..4d2cede17468 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -13,26 +13,40 @@ #include #include #include +#include + #include #include #include #include -#define BPF_TAB_MASK 15 +#define BPF_TAB_MASK 15 +#define ACT_BPF_NAME_LEN 256 + +struct tcf_bpf_cfg { + struct bpf_prog *filter; + struct sock_filter *bpf_ops; + char *bpf_name; + u32 bpf_fd; + u16 bpf_num_ops; +}; -static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a, +static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { - struct tcf_bpf *b = a->priv; + struct tcf_bpf *prog = act->priv; int action, filter_res; - spin_lock(&b->tcf_lock); + spin_lock(&prog->tcf_lock); - b->tcf_tm.lastuse = jiffies; - bstats_update(&b->tcf_bstats, skb); + prog->tcf_tm.lastuse = jiffies; + bstats_update(&prog->tcf_bstats, skb); - filter_res = BPF_PROG_RUN(b->filter, skb); + /* Needed here for accessing maps. */ + rcu_read_lock(); + filter_res = BPF_PROG_RUN(prog->filter, skb); + rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. * Similarly as in cls_bpf, if filter_res == -1 we use the @@ -52,52 +66,87 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a, break; case TC_ACT_SHOT: action = filter_res; - b->tcf_qstats.drops++; + prog->tcf_qstats.drops++; break; case TC_ACT_UNSPEC: - action = b->tcf_action; + action = prog->tcf_action; break; default: action = TC_ACT_UNSPEC; break; } - spin_unlock(&b->tcf_lock); + spin_unlock(&prog->tcf_lock); return action; } -static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *a, +static bool tcf_bpf_is_ebpf(const struct tcf_bpf *prog) +{ + return !prog->bpf_ops; +} + +static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog, + struct sk_buff *skb) +{ + struct nlattr *nla; + + if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, prog->bpf_num_ops)) + return -EMSGSIZE; + + nla = nla_reserve(skb, TCA_ACT_BPF_OPS, prog->bpf_num_ops * + sizeof(struct sock_filter)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + return 0; +} + +static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, TCA_ACT_BPF_FD, prog->bpf_fd)) + return -EMSGSIZE; + + if (prog->bpf_name && + nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) + return -EMSGSIZE; + + return 0; +} + +static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) { unsigned char *tp = skb_tail_pointer(skb); - struct tcf_bpf *b = a->priv; + struct tcf_bpf *prog = act->priv; struct tc_act_bpf opt = { - .index = b->tcf_index, - .refcnt = b->tcf_refcnt - ref, - .bindcnt = b->tcf_bindcnt - bind, - .action = b->tcf_action, + .index = prog->tcf_index, + .refcnt = prog->tcf_refcnt - ref, + .bindcnt = prog->tcf_bindcnt - bind, + .action = prog->tcf_action, }; - struct tcf_t t; - struct nlattr *nla; + struct tcf_t tm; + int ret; if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, b->bpf_num_ops)) - goto nla_put_failure; - - nla = nla_reserve(skb, TCA_ACT_BPF_OPS, b->bpf_num_ops * - sizeof(struct sock_filter)); - if (!nla) + if (tcf_bpf_is_ebpf(prog)) + ret = tcf_bpf_dump_ebpf_info(prog, skb); + else + ret = tcf_bpf_dump_bpf_info(prog, skb); + if (ret) goto nla_put_failure; - memcpy(nla_data(nla), b->bpf_ops, nla_len(nla)); + tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install); + tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse); + tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires); - t.install = jiffies_to_clock_t(jiffies - b->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - b->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(b->tcf_tm.expires); - if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(t), &t)) + if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm)) goto nla_put_failure; + return skb->len; nla_put_failure: @@ -107,36 +156,21 @@ nla_put_failure: static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = { [TCA_ACT_BPF_PARMS] = { .len = sizeof(struct tc_act_bpf) }, + [TCA_ACT_BPF_FD] = { .type = NLA_U32 }, + [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN }, [TCA_ACT_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_ACT_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; -static int tcf_bpf_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, - int ovr, int bind) +static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg) { - struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; - struct tc_act_bpf *parm; - struct tcf_bpf *b; - u16 bpf_size, bpf_num_ops; struct sock_filter *bpf_ops; - struct sock_fprog_kern tmp; + struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; + u16 bpf_size, bpf_num_ops; int ret; - if (!nla) - return -EINVAL; - - ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy); - if (ret < 0) - return ret; - - if (!tb[TCA_ACT_BPF_PARMS] || - !tb[TCA_ACT_BPF_OPS_LEN] || !tb[TCA_ACT_BPF_OPS]) - return -EINVAL; - parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - bpf_num_ops = nla_get_u16(tb[TCA_ACT_BPF_OPS_LEN]); if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) return -EINVAL; @@ -146,68 +180,165 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return -EINVAL; bpf_ops = kzalloc(bpf_size, GFP_KERNEL); - if (!bpf_ops) + if (bpf_ops == NULL) return -ENOMEM; memcpy(bpf_ops, nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size); - tmp.len = bpf_num_ops; - tmp.filter = bpf_ops; + fprog_tmp.len = bpf_num_ops; + fprog_tmp.filter = bpf_ops; - ret = bpf_prog_create(&fp, &tmp); - if (ret) - goto free_bpf_ops; + ret = bpf_prog_create(&fp, &fprog_tmp); + if (ret < 0) { + kfree(bpf_ops); + return ret; + } - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*b), bind); - if (ret) + cfg->bpf_ops = bpf_ops; + cfg->bpf_num_ops = bpf_num_ops; + cfg->filter = fp; + + return 0; +} + +static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg) +{ + struct bpf_prog *fp; + char *name = NULL; + u32 bpf_fd; + + bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]); + + fp = bpf_prog_get(bpf_fd); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + if (fp->type != BPF_PROG_TYPE_SCHED_ACT) { + bpf_prog_put(fp); + return -EINVAL; + } + + if (tb[TCA_ACT_BPF_NAME]) { + name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]), + nla_len(tb[TCA_ACT_BPF_NAME]), + GFP_KERNEL); + if (!name) { + bpf_prog_put(fp); + return -ENOMEM; + } + } + + cfg->bpf_fd = bpf_fd; + cfg->bpf_name = name; + cfg->filter = fp; + + return 0; +} + +static int tcf_bpf_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *act, + int replace, int bind) +{ + struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; + struct tc_act_bpf *parm; + struct tcf_bpf *prog; + struct tcf_bpf_cfg cfg; + bool is_bpf, is_ebpf; + int ret; + + if (!nla) + return -EINVAL; + + ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy); + if (ret < 0) + return ret; + + is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; + is_ebpf = tb[TCA_ACT_BPF_FD]; + + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || + !tb[TCA_ACT_BPF_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_ACT_BPF_PARMS]); + + memset(&cfg, 0, sizeof(cfg)); + + ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) : + tcf_bpf_init_from_efd(tb, &cfg); + if (ret < 0) + return ret; + + if (!tcf_hash_check(parm->index, act, bind)) { + ret = tcf_hash_create(parm->index, est, act, + sizeof(*prog), bind); + if (ret < 0) goto destroy_fp; ret = ACT_P_CREATED; } else { + /* Don't override defaults. */ if (bind) goto destroy_fp; - tcf_hash_release(a, bind); - if (!ovr) { + + tcf_hash_release(act, bind); + if (!replace) { ret = -EEXIST; goto destroy_fp; } } - b = to_bpf(a); - spin_lock_bh(&b->tcf_lock); - b->tcf_action = parm->action; - b->bpf_num_ops = bpf_num_ops; - b->bpf_ops = bpf_ops; - b->filter = fp; - spin_unlock_bh(&b->tcf_lock); + prog = to_bpf(act); + spin_lock_bh(&prog->tcf_lock); + + prog->bpf_ops = cfg.bpf_ops; + prog->bpf_name = cfg.bpf_name; + + if (cfg.bpf_num_ops) + prog->bpf_num_ops = cfg.bpf_num_ops; + if (cfg.bpf_fd) + prog->bpf_fd = cfg.bpf_fd; + + prog->tcf_action = parm->action; + prog->filter = cfg.filter; + + spin_unlock_bh(&prog->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(act); + return ret; destroy_fp: - bpf_prog_destroy(fp); -free_bpf_ops: - kfree(bpf_ops); + if (is_ebpf) + bpf_prog_put(cfg.filter); + else + bpf_prog_destroy(cfg.filter); + + kfree(cfg.bpf_ops); + kfree(cfg.bpf_name); + return ret; } -static void tcf_bpf_cleanup(struct tc_action *a, int bind) +static void tcf_bpf_cleanup(struct tc_action *act, int bind) { - struct tcf_bpf *b = a->priv; + const struct tcf_bpf *prog = act->priv; - bpf_prog_destroy(b->filter); + if (tcf_bpf_is_ebpf(prog)) + bpf_prog_put(prog->filter); + else + bpf_prog_destroy(prog->filter); } -static struct tc_action_ops act_bpf_ops = { - .kind = "bpf", - .type = TCA_ACT_BPF, - .owner = THIS_MODULE, - .act = tcf_bpf, - .dump = tcf_bpf_dump, - .cleanup = tcf_bpf_cleanup, - .init = tcf_bpf_init, +static struct tc_action_ops act_bpf_ops __read_mostly = { + .kind = "bpf", + .type = TCA_ACT_BPF, + .owner = THIS_MODULE, + .act = tcf_bpf, + .dump = tcf_bpf_dump, + .cleanup = tcf_bpf_cleanup, + .init = tcf_bpf_init, }; static int __init bpf_init_module(void) -- cgit v1.2.3 From 8da86466b83787df0d4b89ec81c310de072d101c Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki/吉藤英明 Date: Thu, 19 Mar 2015 22:41:46 +0900 Subject: net: neighbour: Add mcast_resolicit to configure the number of multicast resolicitations in PROBE state. We send unicast neighbor (ARP or NDP) solicitations ucast_probes times in PROBE state. Zhu Yanjun reported that some implementation does not reply against them and the entry will become FAILED, which is undesirable. We had been dealt with such nodes by sending multicast probes mcast_ solicit times after unicast probes in PROBE state. In 2003, I made a change not to send them to improve compatibility with IPv6 NDP. Let's introduce per-protocol per-interface sysctl knob "mcast_ reprobe" to configure the number of multicast (re)solicitation for reconfirmation in PROBE state. The default is 0, since we have been doing so for 10+ years. Reported-by: Zhu Yanjun CC: Ulf Samuelsson Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/neighbour.h | 1 + include/uapi/linux/neighbour.h | 1 + net/core/neighbour.c | 15 +++++++++++---- 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index e7bdf5170802..bd33e66f49aa 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -42,6 +42,7 @@ enum { NEIGH_VAR_MCAST_PROBES, NEIGH_VAR_UCAST_PROBES, NEIGH_VAR_APP_PROBES, + NEIGH_VAR_MCAST_REPROBES, NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 3873a35509aa..2e35c61bbdd1 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -126,6 +126,7 @@ enum { NDTPA_PROXY_QLEN, /* u32 */ NDTPA_LOCKTIME, /* u64, msecs */ NDTPA_QUEUE_LENBYTES, /* u32 */ + NDTPA_MCAST_REPROBES, /* u32 */ __NDTPA_MAX }; #define NDTPA_MAX (__NDTPA_MAX - 1) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 0e8b32efc031..3de654256028 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -817,10 +817,9 @@ out: static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES); - if (!(n->nud_state & NUD_PROBE)) - max_probes += NEIGH_VAR(p, MCAST_PROBES); - return max_probes; + return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : + NEIGH_VAR(p, MCAST_PROBES)); } static void neigh_invalidate(struct neighbour *neigh) @@ -1742,6 +1741,8 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) NEIGH_VAR(parms, UCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_PROBES, NEIGH_VAR(parms, MCAST_PROBES)) || + nla_put_u32(skb, NDTPA_MCAST_REPROBES, + NEIGH_VAR(parms, MCAST_REPROBES)) || nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME)) || @@ -1901,6 +1902,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, @@ -2001,6 +2003,10 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) NEIGH_VAR_SET(p, MCAST_PROBES, nla_get_u32(tbp[i])); break; + case NDTPA_MCAST_REPROBES: + NEIGH_VAR_SET(p, MCAST_REPROBES, + nla_get_u32(tbp[i])); + break; case NDTPA_BASE_REACHABLE_TIME: NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, nla_get_msecs(tbp[i])); @@ -2987,6 +2993,7 @@ static struct neigh_sysctl_table { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), -- cgit v1.2.3 From 682f048bd49449f4ab978664a7f69a44a74e3caa Mon Sep 17 00:00:00 2001 From: Alexander Drozdov Date: Mon, 23 Mar 2015 09:11:13 +0300 Subject: af_packet: pass checksum validation status to the user Introduce TP_STATUS_CSUM_VALID tp_status flag to tell the af_packet user that at least the transport header checksum has been already validated. For now, the flag may be set for incoming packets only. Signed-off-by: Alexander Drozdov Cc: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/packet_mmap.txt | 13 ++++++++++--- include/uapi/linux/if_packet.h | 1 + net/packet/af_packet.c | 9 +++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index a6d7cb91069e..daa015af16a0 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -440,9 +440,10 @@ and the following flags apply: +++ Capture process: from include/linux/if_packet.h - #define TP_STATUS_COPY 2 - #define TP_STATUS_LOSING 4 - #define TP_STATUS_CSUMNOTREADY 8 + #define TP_STATUS_COPY (1 << 1) + #define TP_STATUS_LOSING (1 << 2) + #define TP_STATUS_CSUMNOTREADY (1 << 3) + #define TP_STATUS_CSUM_VALID (1 << 7) TP_STATUS_COPY : This flag indicates that the frame (and associated meta information) has been truncated because it's @@ -466,6 +467,12 @@ TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which reading the packet we should not try to check the checksum. +TP_STATUS_CSUM_VALID : This flag indicates that at least the transport + header checksum of the packet has been already + validated on the kernel side. If the flag is not set + then we are free to check the checksum by ourselves + provided that TP_STATUS_CSUMNOTREADY is also not set. + for convenience there are also the following defines: #define TP_STATUS_KERNEL 0 diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index da2d668b8cf1..053bd102fbe0 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -99,6 +99,7 @@ struct tpacket_auxdata { #define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */ #define TP_STATUS_BLK_TMO (1 << 5) #define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */ +#define TP_STATUS_CSUM_VALID (1 << 7) /* Tx ring - header status */ #define TP_STATUS_AVAILABLE 0 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9d854c5ce0b5..5102c3cc4eec 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1924,6 +1924,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; + else if (skb->pkt_type != PACKET_OUTGOING && + (skb->ip_summed == CHECKSUM_COMPLETE || + skb_csum_unnecessary(skb))) + status |= TP_STATUS_CSUM_VALID; if (snaplen > res) snaplen = res; @@ -3031,6 +3035,11 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; + else if (skb->pkt_type != PACKET_OUTGOING && + (skb->ip_summed == CHECKSUM_COMPLETE || + skb_csum_unnecessary(skb))) + aux.tp_status |= TP_STATUS_CSUM_VALID; + aux.tp_len = origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; -- cgit v1.2.3 From 3d1bec99320d4e96897805440f8cf4f68eff226b Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 23 Mar 2015 23:36:00 +0100 Subject: ipv6: introduce secret_stable to ipv6_devconf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements the procfs logic for the stable_address knob: The secret is formatted as an ipv6 address and will be stored per interface and per namespace. We track initialized flag and return EIO errors until the secret is set. We don't inherit the secret to newly created namespaces. Cc: Erik Kline Cc: Fernando Gont Cc: Lorenzo Colitti Cc: YOSHIFUJI Hideaki/吉藤英明 Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/ipv6.h | 4 +++ include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 4d5169f5d7d1..82806c60aa42 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -53,6 +53,10 @@ struct ipv6_devconf { __s32 ndisc_notify; __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; + struct ipv6_stable_secret { + bool initialized; + struct in6_addr secret; + } stable_secret; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 437a6a4b125a..5efa54ae567c 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -170,6 +170,7 @@ enum { DEVCONF_ACCEPT_RA_FROM_LOCAL, DEVCONF_USE_OPTIMISTIC, DEVCONF_ACCEPT_RA_MTU, + DEVCONF_STABLE_SECRET, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 158378e73f0a..5b967c8a617a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,9 @@ #define INFINITY_LIFE_TIME 0xFFFFFFFF +#define IPV6_MAX_STRLEN \ + sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255") + static inline u32 cstamp_delta(unsigned long cstamp) { return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; @@ -202,6 +206,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_dad = 1, .suppress_frag_ndisc = 1, .accept_ra_mtu = 1, + .stable_secret = { + .initialized = false, + } }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -240,6 +247,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_dad = 1, .suppress_frag_ndisc = 1, .accept_ra_mtu = 1, + .stable_secret = { + .initialized = false, + }, }; /* Check if a valid qdisc is available */ @@ -4430,6 +4440,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; + /* we omit DEVCONF_STABLE_SECRET for now */ } static inline size_t inet6_ifla6_size(void) @@ -5074,6 +5085,53 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, return ret; } +static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int err; + struct in6_addr addr; + char str[IPV6_MAX_STRLEN]; + struct ctl_table lctl = *ctl; + struct ipv6_stable_secret *secret = ctl->data; + + lctl.maxlen = IPV6_MAX_STRLEN; + lctl.data = str; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (!write && !secret->initialized) { + err = -EIO; + goto out; + } + + if (!write) { + err = snprintf(str, sizeof(str), "%pI6", + &secret->secret); + if (err >= sizeof(str)) { + err = -EIO; + goto out; + } + } + + err = proc_dostring(&lctl, write, buffer, lenp, ppos); + if (err || !write) + goto out; + + if (in6_pton(str, -1, addr.in6_u.u6_addr8, -1, NULL) != 1) { + err = -EIO; + goto out; + } + + secret->initialized = true; + secret->secret = addr; + +out: + rtnl_unlock(); + + return err; +} static struct addrconf_sysctl_table { @@ -5346,6 +5404,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "stable_secret", + .data = &ipv6_devconf.stable_secret, + .maxlen = IPV6_MAX_STRLEN, + .mode = 0600, + .proc_handler = addrconf_sysctl_stable_secret, + }, { /* sentinel */ } @@ -5442,6 +5507,9 @@ static int __net_init addrconf_init_net(struct net *net) dflt->autoconf = ipv6_defaults.autoconf; dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; + dflt->stable_secret.initialized = false; + all->stable_secret.initialized = false; + net->ipv6.devconf_all = all; net->ipv6.devconf_dflt = dflt; -- cgit v1.2.3 From 622c81d57b392cc9be836670eb464a4dfaa9adfe Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 23 Mar 2015 23:36:01 +0100 Subject: ipv6: generation of stable privacy addresses for link-local and autoconf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements the stable privacy address generation for link-local and autoconf addresses as specified in RFC7217. RID = F(Prefix, Net_Iface, Network_ID, DAD_Counter, secret_key) is the RID (random identifier). As the hash function F we chose one round of sha1. Prefix will be either the link-local prefix or the router advertised one. As Net_Iface we use the MAC address of the device. DAD_Counter and secret_key are implemented as specified. We don't use Network_ID, as it couples the code too closely to other subsystems. It is specified as optional in the RFC. As Net_Iface we only use the MAC address: we simply have no stable identifier in the kernel we could possibly use: because this code might run very early, we cannot depend on names, as they might be changed by user space early on during the boot process. A new address generation mode is introduced, IN6_ADDR_GEN_MODE_STABLE_PRIVACY. With iproute2 one can switch back to none or eui64 address configuration mode although the stable_secret is already set. We refuse writes to ipv6/conf/all/stable_secret but only allow ipv6/conf/default/stable_secret and the interface specific file to be written to. The default stable_secret is used as the parameter for the namespace, the interface specific can overwrite the secret, e.g. when switching a network configuration from one system to another while inheriting the secret. Cc: Erik Kline Cc: Fernando Gont Cc: Lorenzo Colitti Cc: YOSHIFUJI Hideaki/吉藤英明 Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/ipv6/addrconf.c | 130 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 127 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index f5f5edd5ae5f..7ffb18df01ca 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -216,6 +216,7 @@ enum { enum in6_addr_gen_mode { IN6_ADDR_GEN_MODE_EUI64, IN6_ADDR_GEN_MODE_NONE, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY, }; /* Bridge section */ diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5b967c8a617a..6813268ce8b8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -131,6 +131,9 @@ static void ipv6_regen_rndid(unsigned long data); static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); static int ipv6_count_addresses(struct inet6_dev *idev); +static int ipv6_generate_stable_address(struct in6_addr *addr, + u8 dad_count, + const struct inet6_dev *idev); /* * Configured unicast address hash table @@ -2302,6 +2305,11 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) in6_dev->token.s6_addr + 8, 8); read_unlock_bh(&in6_dev->lock); tokenized = true; + } else if (in6_dev->addr_gen_mode == + IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + !ipv6_generate_stable_address(&addr, 0, + in6_dev)) { + goto ok; } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { in6_dev_put(in6_dev); @@ -2820,12 +2828,98 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr } } +static bool ipv6_reserved_interfaceid(struct in6_addr address) +{ + if ((address.s6_addr32[2] | address.s6_addr32[3]) == 0) + return true; + + if (address.s6_addr32[2] == htonl(0x02005eff) && + ((address.s6_addr32[3] & htonl(0xfe000000)) == htonl(0xfe000000))) + return true; + + if (address.s6_addr32[2] == htonl(0xfdffffff) && + ((address.s6_addr32[3] & htonl(0xffffff80)) == htonl(0xffffff80))) + return true; + + return false; +} + +static int ipv6_generate_stable_address(struct in6_addr *address, + u8 dad_count, + const struct inet6_dev *idev) +{ + static const int idgen_retries = 3; + + static DEFINE_SPINLOCK(lock); + static __u32 digest[SHA_DIGEST_WORDS]; + static __u32 workspace[SHA_WORKSPACE_WORDS]; + + static union { + char __data[SHA_MESSAGE_BYTES]; + struct { + struct in6_addr secret; + __be64 prefix; + unsigned char hwaddr[MAX_ADDR_LEN]; + u8 dad_count; + } __packed; + } data; + + struct in6_addr secret; + struct in6_addr temp; + struct net *net = dev_net(idev->dev); + + BUILD_BUG_ON(sizeof(data.__data) != sizeof(data)); + + if (idev->cnf.stable_secret.initialized) + secret = idev->cnf.stable_secret.secret; + else if (net->ipv6.devconf_dflt->stable_secret.initialized) + secret = net->ipv6.devconf_dflt->stable_secret.secret; + else + return -1; + +retry: + spin_lock_bh(&lock); + + sha_init(digest); + memset(&data, 0, sizeof(data)); + memset(workspace, 0, sizeof(workspace)); + memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len); + data.prefix = ((__be64)address->s6_addr32[0] << 32) | + (__be64)address->s6_addr32[1]; + data.secret = secret; + data.dad_count = dad_count; + + sha_transform(digest, data.__data, workspace); + + temp = *address; + temp.s6_addr32[2] = digest[0]; + temp.s6_addr32[3] = digest[1]; + + spin_unlock_bh(&lock); + + if (ipv6_reserved_interfaceid(temp)) { + dad_count++; + if (dad_count > idgen_retries) + return -1; + goto retry; + } + + *address = temp; + return 0; +} + static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) { - if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { - struct in6_addr addr; + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) { + if (!ipv6_generate_stable_address(&addr, 0, idev)) + addrconf_add_linklocal(idev, &addr); + else if (prefix_route) + addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + } else if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { /* addrconf_add_linklocal also adds a prefix_route and we * only need to care about prefix routes if ipv6_generate_eui64 * couldn't generate one. @@ -4675,8 +4769,15 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); if (mode != IN6_ADDR_GEN_MODE_EUI64 && - mode != IN6_ADDR_GEN_MODE_NONE) + mode != IN6_ADDR_GEN_MODE_NONE && + mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY) + return -EINVAL; + + if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + !idev->cnf.stable_secret.initialized && + !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized) return -EINVAL; + idev->addr_gen_mode = mode; err = 0; } @@ -5093,8 +5194,12 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, struct in6_addr addr; char str[IPV6_MAX_STRLEN]; struct ctl_table lctl = *ctl; + struct net *net = ctl->extra2; struct ipv6_stable_secret *secret = ctl->data; + if (&net->ipv6.devconf_all->stable_secret == ctl->data) + return -EIO; + lctl.maxlen = IPV6_MAX_STRLEN; lctl.data = str; @@ -5127,6 +5232,23 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, secret->initialized = true; secret->secret = addr; + if (&net->ipv6.devconf_dflt->stable_secret == ctl->data) { + struct net_device *dev; + + for_each_netdev(net, dev) { + struct inet6_dev *idev = __in6_dev_get(dev); + + if (idev) { + idev->addr_gen_mode = + IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + } + } + } else { + struct inet6_dev *idev = ctl->extra1; + + idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + } + out: rtnl_unlock(); -- cgit v1.2.3 From 64236f3f3d742469e4027b83a9515e84e9ab21b4 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 23 Mar 2015 23:36:02 +0100 Subject: ipv6: introduce IFA_F_STABLE_PRIVACY flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to mark appropriate addresses so we can do retries in case their DAD failed. Cc: Erik Kline Cc: Fernando Gont Cc: Lorenzo Colitti Cc: YOSHIFUJI Hideaki/吉藤英明 Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/if_addr.h | 1 + net/ipv6/addrconf.c | 14 ++++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index 40fdfea39714..4318ab1635ce 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -51,6 +51,7 @@ enum { #define IFA_F_MANAGETEMPADDR 0x100 #define IFA_F_NOPREFIXROUTE 0x200 #define IFA_F_MCAUTOJOIN 0x400 +#define IFA_F_STABLE_PRIVACY 0x800 struct ifa_cacheinfo { __u32 ifa_prefered; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6813268ce8b8..c2357b6f62dd 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2199,6 +2199,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) __u32 valid_lft; __u32 prefered_lft; int addr_type; + u32 addr_flags = 0; struct inet6_dev *in6_dev; struct net *net = dev_net(dev); @@ -2309,6 +2310,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) IN6_ADDR_GEN_MODE_STABLE_PRIVACY && !ipv6_generate_stable_address(&addr, 0, in6_dev)) { + addr_flags |= IFA_F_STABLE_PRIVACY; goto ok; } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { @@ -2328,7 +2330,6 @@ ok: if (ifp == NULL && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; - u32 addr_flags = 0; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (in6_dev->cnf.optimistic_dad && @@ -2807,10 +2808,11 @@ static void init_loopback(struct net_device *dev) } } -static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr) +static void addrconf_add_linklocal(struct inet6_dev *idev, + const struct in6_addr *addr, u32 flags) { struct inet6_ifaddr *ifp; - u32 addr_flags = IFA_F_PERMANENT; + u32 addr_flags = flags | IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (idev->cnf.optimistic_dad && @@ -2818,7 +2820,6 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr addr_flags |= IFA_F_OPTIMISTIC; #endif - ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); if (!IS_ERR(ifp)) { @@ -2916,7 +2917,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) { if (!ipv6_generate_stable_address(&addr, 0, idev)) - addrconf_add_linklocal(idev, &addr); + addrconf_add_linklocal(idev, &addr, + IFA_F_STABLE_PRIVACY); else if (prefix_route) addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); } else if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { @@ -2925,7 +2927,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) * couldn't generate one. */ if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) - addrconf_add_linklocal(idev, &addr); + addrconf_add_linklocal(idev, &addr, 0); else if (prefix_route) addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); } -- cgit v1.2.3 From 27cd5452476978283decb19e429e81fc6c71e74b Mon Sep 17 00:00:00 2001 From: Michal Sekletar Date: Tue, 24 Mar 2015 14:48:41 +0100 Subject: filter: introduce SKF_AD_VLAN_TPID BPF extension If vlan offloading takes place then vlan header is removed from frame and its contents, both vlan_tci and vlan_proto, is available to user space via TPACKET interface. However, only vlan_tci can be used in BPF filters. This commit introduces a new BPF extension. It makes possible to load the value of vlan_proto (vlan TPID) to register A. Support for classic BPF and eBPF is being added, analogous to skb->protocol. Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Jiri Pirko Signed-off-by: Michal Sekletar Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 3 ++- include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 1 + include/uapi/linux/filter.h | 3 ++- net/core/filter.c | 17 +++++++++++++++++ tools/net/bpf_exp.l | 2 ++ tools/net/bpf_exp.y | 11 ++++++++++- 7 files changed, 35 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 9930ecfbb465..135581f015e1 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -280,7 +280,8 @@ Possible BPF extensions are shown in the following table: rxhash skb->hash cpu raw_smp_processor_id() vlan_tci skb_vlan_tag_get(skb) - vlan_pr skb_vlan_tag_present(skb) + vlan_avail skb_vlan_tag_present(skb) + vlan_tpid skb->vlan_proto rand prandom_u32() These extensions can also be prefixed with '#'. diff --git a/include/linux/filter.h b/include/linux/filter.h index 9ee8c67ea249..fa11b3a367be 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -454,6 +454,7 @@ static inline u16 bpf_anc_helper(const struct sock_filter *ftest) BPF_ANCILLARY(VLAN_TAG_PRESENT); BPF_ANCILLARY(PAY_OFFSET); BPF_ANCILLARY(RANDOM); + BPF_ANCILLARY(VLAN_TPID); } /* Fallthrough. */ default: diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3dd314a45d0d..27dc4ec58840 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -182,6 +182,7 @@ struct __sk_buff { __u32 protocol; __u32 vlan_present; __u32 vlan_tci; + __u32 vlan_proto; }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h index 47785d5ecf17..34c7936ca114 100644 --- a/include/uapi/linux/filter.h +++ b/include/uapi/linux/filter.h @@ -77,7 +77,8 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ #define SKF_AD_VLAN_TAG_PRESENT 48 #define SKF_AD_PAY_OFFSET 52 #define SKF_AD_RANDOM 56 -#define SKF_AD_MAX 60 +#define SKF_AD_VLAN_TPID 60 +#define SKF_AD_MAX 64 #define SKF_NET_OFF (-0x100000) #define SKF_LL_OFF (-0x200000) diff --git a/net/core/filter.c b/net/core/filter.c index 084eacc4d1d4..32f43c59908c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -272,6 +272,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp, insn += cnt - 1; break; + case SKF_AD_OFF + SKF_AD_VLAN_TPID: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); + + /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, vlan_proto)); + /* A = ntohs(A) [emitting a nop or swap16] */ + *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); + break; + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: case SKF_AD_OFF + SKF_AD_NLATTR: case SKF_AD_OFF + SKF_AD_NLATTR_NEST: @@ -1226,6 +1236,13 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, offsetof(struct sk_buff, protocol)); break; + case offsetof(struct __sk_buff, vlan_proto): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_proto)); + break; + case offsetof(struct __sk_buff, mark): return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l index 833a96611da6..c83af3fb77de 100644 --- a/tools/net/bpf_exp.l +++ b/tools/net/bpf_exp.l @@ -92,6 +92,8 @@ extern void yyerror(const char *str); "#"?("cpu") { return K_CPU; } "#"?("vlan_tci") { return K_VLANT; } "#"?("vlan_pr") { return K_VLANP; } +"#"?("vlan_avail") { return K_VLANP; } +"#"?("vlan_tpid") { return K_VLANTPID; } "#"?("rand") { return K_RAND; } ":" { return ':'; } diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y index e6306c51c26f..f8332749b44c 100644 --- a/tools/net/bpf_exp.y +++ b/tools/net/bpf_exp.y @@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type); %token OP_LDXI %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE -%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND +%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_VLANTPID K_POFF K_RAND %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%' @@ -167,6 +167,9 @@ ldb | OP_LDB K_RAND { bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_RANDOM); } + | OP_LDB K_VLANTPID { + bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0, + SKF_AD_OFF + SKF_AD_VLAN_TPID); } ; ldh @@ -218,6 +221,9 @@ ldh | OP_LDH K_RAND { bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_RANDOM); } + | OP_LDH K_VLANTPID { + bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0, + SKF_AD_OFF + SKF_AD_VLAN_TPID); } ; ldi @@ -274,6 +280,9 @@ ld | OP_LD K_RAND { bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_RANDOM); } + | OP_LD K_VLANTPID { + bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0, + SKF_AD_OFF + SKF_AD_VLAN_TPID); } | OP_LD 'M' '[' number ']' { bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); } | OP_LD '[' 'x' '+' number ']' { -- cgit v1.2.3 From 2c60fae1489c70206e66c28d72b69a3e496c313d Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 26 Mar 2015 21:47:16 +0200 Subject: drm/i915: fix definition of the DRM_IOCTL_I915_GET_SPRITE_COLORKEY ioctl Fix definition of the DRM_IOCTL_I915_GET_SPRITE_COLORKEY ioctl, so that it is different from the DRM_IOCTL_I915_SET_SPRITE_COLORKEY ioctl. Note that this is just for accuracy, the ioctl implementation itself is totally unused and already ripped out. Signed-off-by: Tommi Rantala [danvet: Add note that this is a dead ioctl.] Signed-off-by: Daniel Vetter --- include/uapi/drm/i915_drm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 8d1be9073380..551b6737f5df 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -270,7 +270,7 @@ typedef struct _drm_i915_sarea { #define DRM_IOCTL_I915_OVERLAY_PUT_IMAGE DRM_IOW(DRM_COMMAND_BASE + DRM_I915_OVERLAY_PUT_IMAGE, struct drm_intel_overlay_put_image) #define DRM_IOCTL_I915_OVERLAY_ATTRS DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_OVERLAY_ATTRS, struct drm_intel_overlay_attrs) #define DRM_IOCTL_I915_SET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_SET_SPRITE_COLORKEY, struct drm_intel_sprite_colorkey) -#define DRM_IOCTL_I915_GET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_SET_SPRITE_COLORKEY, struct drm_intel_sprite_colorkey) +#define DRM_IOCTL_I915_GET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_SPRITE_COLORKEY, struct drm_intel_sprite_colorkey) #define DRM_IOCTL_I915_GEM_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_WAIT, struct drm_i915_gem_wait) #define DRM_IOCTL_I915_GEM_CONTEXT_CREATE DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create) #define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy) -- cgit v1.2.3 From 34f439278cef7b1177f8ce24f9fc81dfc6221d3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Feb 2015 14:05:38 +0100 Subject: perf: Add per event clockid support While thinking on the whole clock discussion it occurred to me we have two distinct uses of time: 1) the tracking of event/ctx/cgroup enabled/running/stopped times which includes the self-monitoring support in struct perf_event_mmap_page. 2) the actual timestamps visible in the data records. And we've been conflating them. The first is all about tracking time deltas, nobody should really care in what time base that happens, its all relative information, as long as its internally consistent it works. The second however is what people are worried about when having to merge their data with external sources. And here we have the discussion on MONOTONIC vs MONOTONIC_RAW etc.. Where MONOTONIC is good for correlating between machines (static offset), MONOTNIC_RAW is required for correlating against a fixed rate hardware clock. This means configurability; now 1) makes that hard because it needs to be internally consistent across groups of unrelated events; which is why we had to have a global perf_clock(). However, for 2) it doesn't really matter, perf itself doesn't care what it writes into the buffer. The below patch makes the distinction between these two cases by adding perf_event_clock() which is used for the second case. It further makes this configurable on a per-event basis, but adds a few sanity checks such that we cannot combine events with different clocks in confusing ways. And since we then have per-event configurability we might as well retain the 'legacy' behaviour as a default. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 ++++++-- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 6 ++-- kernel/events/core.c | 77 ++++++++++++++++++++++++++++++++++++++-- 4 files changed, 91 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ac41b3ad1fc9..0420ebcac116 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event, data = cyc2ns_read_begin(); + /* + * Internal timekeeping for enabled/running/stopped times + * is always in the local_clock domain. + */ userpg->cap_user_time = 1; userpg->time_mult = data->cyc2ns_mul; userpg->time_shift = data->cyc2ns_shift; userpg->time_offset = data->cyc2ns_offset - now; - userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; + /* + * cap_user_time_zero doesn't make sense when we're using a different + * time base for the records. + */ + if (event->clock == &local_clock) { + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + } cyc2ns_read_end(data); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b16eac5f54ce..401554074de9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -173,6 +173,7 @@ struct perf_event; * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x01 +#define PERF_PMU_CAP_NO_NMI 0x02 /** * struct pmu - generic performance monitoring unit @@ -457,6 +458,7 @@ struct perf_event { struct pid_namespace *ns; u64 id; + u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1e3cd07cf76e..3bb40ddadbe5 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -326,7 +326,8 @@ struct perf_event_attr { exclude_callchain_user : 1, /* exclude user callchains */ mmap2 : 1, /* include mmap with inode data */ comm_exec : 1, /* flag comm events that are due to an exec */ - __reserved_1 : 39; + use_clockid : 1, /* use @clockid for time fields */ + __reserved_1 : 38; union { __u32 wakeup_events; /* wakeup every n events */ @@ -355,8 +356,7 @@ struct perf_event_attr { */ __u32 sample_stack_user; - /* Align to u64. */ - __u32 __reserved_2; + __s32 clockid; /* * Defines set of regs to dump for each sample * state captured on: diff --git a/kernel/events/core.c b/kernel/events/core.c index bb1a7c36e794..c40c2cac2d8e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -327,6 +327,11 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline u64 perf_event_clock(struct perf_event *event) +{ + return event->clock(); +} + static inline struct perf_cpu_context * __get_cpu_context(struct perf_event_context *ctx) { @@ -4762,7 +4767,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_TIME) - data->time = perf_clock(); + data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); @@ -5340,6 +5345,8 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); + task_event->event_id.time = perf_event_clock(event); + perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); @@ -5373,7 +5380,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ - .time = perf_clock(), + /* .time */ }, }; @@ -5749,7 +5756,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) .misc = 0, .size = sizeof(throttle_event), }, - .time = perf_clock(), + .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; @@ -6293,6 +6300,8 @@ static int perf_swevent_init(struct perf_event *event) static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, @@ -6636,6 +6645,8 @@ static int cpu_clock_event_init(struct perf_event *event) static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, @@ -6715,6 +6726,8 @@ static int task_clock_event_init(struct perf_event *event) static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, @@ -7200,6 +7213,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->hw.target = task; } + event->clock = &local_clock; + if (parent_event) + event->clock = parent_event->clock; + if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; @@ -7422,6 +7439,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; + /* + * Mixing clocks in the same buffer is trouble you don't need. + */ + if (output_event->clock != event->clock) + goto out; + set: mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ @@ -7454,6 +7477,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b) mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } +static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) +{ + bool nmi_safe = false; + + switch (clk_id) { + case CLOCK_MONOTONIC: + event->clock = &ktime_get_mono_fast_ns; + nmi_safe = true; + break; + + case CLOCK_MONOTONIC_RAW: + event->clock = &ktime_get_raw_fast_ns; + nmi_safe = true; + break; + + case CLOCK_REALTIME: + event->clock = &ktime_get_real_ns; + break; + + case CLOCK_BOOTTIME: + event->clock = &ktime_get_boot_ns; + break; + + case CLOCK_TAI: + event->clock = &ktime_get_tai_ns; + break; + + default: + return -EINVAL; + } + + if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) + return -EINVAL; + + return 0; +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -7569,6 +7629,12 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = event->pmu; + if (attr.use_clockid) { + err = perf_event_set_clock(event, attr.clockid); + if (err) + goto err_alloc; + } + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -7618,6 +7684,11 @@ SYSCALL_DEFINE5(perf_event_open, */ if (group_leader->group_leader != group_leader) goto err_context; + + /* All events in a group should have the same clock */ + if (group_leader->clock != event->clock) + goto err_context; + /* * Do not allow to attach to a group in a different * task or CPU context: -- cgit v1.2.3 From 5fafd8748b366105e08c198892e9fe02ef15c021 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 8 Dec 2014 23:07:56 +0000 Subject: MIPS: KVM: Wire up FPU capability Now that the code is in place for KVM to support FPU in MIPS KVM guests, wire up the new KVM_CAP_MIPS_FPU capability. For backwards compatibility, the capability must be explicitly enabled in order to detect or make use of the FPU from the guest. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Ralf Baechle Cc: Gleb Natapov Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-api@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 13 +++++++++++++ arch/mips/kvm/mips.c | 37 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 3 files changed, 51 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f3c198360785..a1e9bfa5fe9e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3312,6 +3312,19 @@ Parameters: none This capability enables the in-kernel irqchip for s390. Please refer to "4.24 KVM_CREATE_IRQCHIP" for details. +6.9 KVM_CAP_MIPS_FPU + +Architectures: mips +Target: vcpu +Parameters: args[0] is reserved for future use (should be 0). + +This capability allows the use of the host Floating Point Unit by the guest. It +allows the Config1.FP bit to be set to enable the FPU in the guest. Once this is +done the KVM_REG_MIPS_FPR_* and KVM_REG_MIPS_FCR_* registers can be accessed +(depending on the current guest FPU register mode), and the Status.FR, +Config5.FRE bits are accessible via the KVM API and also from the guest, +depending on them being supported by the FPU. + 7. Capabilities that can be enabled on VMs ------------------------------------------ diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 5e41afe15ae8..7f86cb73d05d 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -797,6 +797,30 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, return 0; } +static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, + struct kvm_enable_cap *cap) +{ + int r = 0; + + if (!kvm_vm_ioctl_check_extension(vcpu->kvm, cap->cap)) + return -EINVAL; + if (cap->flags) + return -EINVAL; + if (cap->args[0]) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_MIPS_FPU: + vcpu->arch.fpu_enabled = true; + break; + default: + r = -EINVAL; + break; + } + + return r; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -854,6 +878,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); break; } + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); + break; + } default: r = -ENOIOCTLCMD; } @@ -962,11 +995,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_ONE_REG: + case KVM_CAP_ENABLE_CAP: r = 1; break; case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; + case KVM_CAP_MIPS_FPU: + r = !!cpu_has_fpu; + break; default: r = 0; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 1162ef7a3fa1..ce49688976d2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -802,6 +802,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_MEM_OP 108 #define KVM_CAP_S390_USER_STSI 109 #define KVM_CAP_S390_SKEYS 110 +#define KVM_CAP_MIPS_FPU 111 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From d952bd070f79b6dcbad52c03dbc41cbc8ba086c8 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 8 Dec 2014 23:07:56 +0000 Subject: MIPS: KVM: Wire up MSA capability Now that the code is in place for KVM to support MIPS SIMD Architecutre (MSA) in MIPS guests, wire up the new KVM_CAP_MIPS_MSA capability. For backwards compatibility, the capability must be explicitly enabled in order to detect or make use of MSA from the guest. The capability is not supported if the hardware supports MSA vector partitioning, since the extra support cannot be tested yet and it extends the state that the userland program would have to save. Signed-off-by: James Hogan Acked-by: Paolo Bonzini Cc: Ralf Baechle Cc: Gleb Natapov Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-api@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 12 ++++++++++++ arch/mips/kvm/mips.c | 18 ++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 3 files changed, 31 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 62809871814b..1490eb0ef798 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3335,6 +3335,18 @@ done the KVM_REG_MIPS_FPR_* and KVM_REG_MIPS_FCR_* registers can be accessed Config5.FRE bits are accessible via the KVM API and also from the guest, depending on them being supported by the FPU. +6.10 KVM_CAP_MIPS_MSA + +Architectures: mips +Target: vcpu +Parameters: args[0] is reserved for future use (should be 0). + +This capability allows the use of the MIPS SIMD Architecture (MSA) by the guest. +It allows the Config3.MSAP bit to be set to enable the use of MSA by the guest. +Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be +accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from +the guest. + 7. Capabilities that can be enabled on VMs ------------------------------------------ diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 35d3146895f1..bb68e8d520e8 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -880,6 +880,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_MIPS_FPU: vcpu->arch.fpu_enabled = true; break; + case KVM_CAP_MIPS_MSA: + vcpu->arch.msa_enabled = true; + break; default: r = -EINVAL; break; @@ -1071,6 +1074,21 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MIPS_FPU: r = !!cpu_has_fpu; break; + case KVM_CAP_MIPS_MSA: + /* + * We don't support MSA vector partitioning yet: + * 1) It would require explicit support which can't be tested + * yet due to lack of support in current hardware. + * 2) It extends the state that would need to be saved/restored + * by e.g. QEMU for migration. + * + * When vector partitioning hardware becomes available, support + * could be added by requiring a flag when enabling + * KVM_CAP_MIPS_MSA capability to indicate that userland knows + * to save/restore the appropriate extra state. + */ + r = cpu_has_msa && !(boot_cpu_data.msa_id & MSA_IR_WRPF); + break; default: r = 0; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ce49688976d2..05a2083f7a28 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -803,6 +803,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_STSI 109 #define KVM_CAP_S390_SKEYS 110 #define KVM_CAP_MIPS_FPU 111 +#define KVM_CAP_MIPS_MSA 112 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 608cd71a9c7c9db76e78a792c5a4101e12fea43f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 26 Mar 2015 19:53:57 -0700 Subject: tc: bpf: generalize pedit action existing TC action 'pedit' can munge any bits of the packet. Generalize it for use in bpf programs attached as cls_bpf and act_bpf via bpf_skb_store_bytes() helper function. Signed-off-by: Alexei Starovoitov Reviewed-by: Jiri Pirko Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 2 ++ net/core/filter.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 73 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 280a315de8d6..d5cda067115a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -59,6 +59,7 @@ enum bpf_arg_type { ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ + ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 27dc4ec58840..74aab6e0d964 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -168,6 +168,7 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ + BPF_FUNC_skb_store_bytes, /* int skb_store_bytes(skb, offset, from, len) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e714f799ec0..630a7bac1e51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -773,6 +773,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; + } else if (arg_type == ARG_PTR_TO_CTX) { + expected_type = PTR_TO_CTX; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; diff --git a/net/core/filter.c b/net/core/filter.c index 32f43c59908c..444a07e4f68d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1175,6 +1175,56 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } +static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + unsigned int offset = (unsigned int) r2; + void *from = (void *) (long) r3; + unsigned int len = (unsigned int) r4; + char buf[16]; + void *ptr; + + /* bpf verifier guarantees that: + * 'from' pointer points to bpf program stack + * 'len' bytes of it were initialized + * 'len' > 0 + * 'skb' is a valid pointer to 'struct sk_buff' + * + * so check for invalid 'offset' and too large 'len' + */ + if (offset > 0xffff || len > sizeof(buf)) + return -EFAULT; + + if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len)) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, len, buf); + if (unlikely(!ptr)) + return -EFAULT; + + skb_postpull_rcsum(skb, ptr, len); + + memcpy(ptr, from, len); + + if (ptr == buf) + /* skb_store_bits cannot return -EFAULT here */ + skb_store_bits(skb, offset, ptr, len); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0)); + return 0; +} + +const struct bpf_func_proto bpf_skb_store_bytes_proto = { + .func = bpf_skb_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1194,6 +1244,17 @@ sk_filter_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +tc_cls_act_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + default: + return sk_filter_func_proto(func_id); + } +} + static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { @@ -1270,18 +1331,24 @@ static const struct bpf_verifier_ops sk_filter_ops = { .convert_ctx_access = sk_filter_convert_ctx_access, }; +static const struct bpf_verifier_ops tc_cls_act_ops = { + .get_func_proto = tc_cls_act_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, }; static struct bpf_prog_type_list sched_cls_type __read_mostly = { - .ops = &sk_filter_ops, + .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_CLS, }; static struct bpf_prog_type_list sched_act_type __read_mostly = { - .ops = &sk_filter_ops, + .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_ACT, }; -- cgit v1.2.3 From 3a323d4e17dd5a84f6ad036e6f985d263ca973ed Mon Sep 17 00:00:00 2001 From: Luciano Coelho Date: Wed, 18 Mar 2015 10:47:02 +0200 Subject: nl80211: small clarification of the sched_scan delay attribute Just clarify that the delay is only before the first cycle. Signed-off-by: Luciano Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ae16ba9cb1e3..241220c43e86 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1754,8 +1754,9 @@ enum nl80211_commands { * should be contained in the result as the sum of the respective counters * over all channels. * - * @NL80211_ATTR_SCHED_SCAN_DELAY: delay before a scheduled scan (or a - * WoWLAN net-detect scan) is started, u32 in seconds. + * @NL80211_ATTR_SCHED_SCAN_DELAY: delay before the first cycle of a + * scheduled scan (or a WoWLAN net-detect scan) is started, u32 + * in seconds. * @NL80211_ATTR_REG_INDOOR: flag attribute, if set indicates that the device * is operating in an indoor environment. -- cgit v1.2.3 From f3f03330dee0526d82f2a0fd1a79d207ed1ac439 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Mar 2015 18:46:29 +0200 Subject: nfsd: require an explicit option to enable pNFS Turns out sending out layouts to any client is a bad idea if they can't get at the storage device, so require explicit admin action to enable pNFS. Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4layouts.c | 2 +- include/uapi/linux/nfsd/export.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 80e236bf79fc..6904213a4363 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -118,7 +118,7 @@ void nfsd4_setup_layout_type(struct svc_export *exp) { struct super_block *sb = exp->ex_path.mnt->mnt_sb; - if (exp->ex_flags & NFSEXP_NOPNFS) + if (!(exp->ex_flags & NFSEXP_PNFS)) return; if (sb->s_export_op->get_uuid && diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index 4742f2cb42f2..d3bd6ffec041 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -47,7 +47,7 @@ * exported filesystem. */ #define NFSEXP_V4ROOT 0x10000 -#define NFSEXP_NOPNFS 0x20000 +#define NFSEXP_PNFS 0x20000 /* All flags that we claim to support. (Note we don't support NOACL.) */ #define NFSEXP_ALLFLAGS 0x3FE7F -- cgit v1.2.3 From 4b3a81a917a5ef21a4483d699cefd4d9fa35b841 Mon Sep 17 00:00:00 2001 From: Boris Brezillion Date: Wed, 28 Jan 2015 17:49:51 +0100 Subject: Add RGB444_1X12 and RGB565_1X16 media bus formats Add RGB444_1X12 and RGB565_1X16 format definitions and update the documentation. Signed-off-by: Boris Brezillon Acked-by: Mauro Carvalho Chehab Acked-by: Sakari Ailus Acked-by: Laurent Pinchart Acked-by: Hans Verkuil Signed-off-by: Philipp Zabel --- Documentation/DocBook/media/v4l/subdev-formats.xml | 40 ++++++++++++++++++++++ include/uapi/linux/media-bus-format.h | 4 ++- 2 files changed, 43 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/DocBook/media/v4l/subdev-formats.xml b/Documentation/DocBook/media/v4l/subdev-formats.xml index c5ea868e3909..29fe60112f4c 100644 --- a/Documentation/DocBook/media/v4l/subdev-formats.xml +++ b/Documentation/DocBook/media/v4l/subdev-formats.xml @@ -192,6 +192,24 @@ see . + + MEDIA_BUS_FMT_RGB444_1X12 + 0x100e + + &dash-ent-20; + r3 + r2 + r1 + r0 + g3 + g2 + g1 + g0 + b3 + b2 + b1 + b0 + MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE 0x1001 @@ -304,6 +322,28 @@ see . g4 g3 + + MEDIA_BUS_FMT_RGB565_1X16 + 0x100f + + &dash-ent-16; + r4 + r3 + r2 + r1 + r0 + g5 + g4 + g3 + g2 + g1 + g0 + b4 + b3 + b2 + b1 + b0 + MEDIA_BUS_FMT_BGR565_2X8_BE 0x1005 diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h index 23b40908be30..37091c668f65 100644 --- a/include/uapi/linux/media-bus-format.h +++ b/include/uapi/linux/media-bus-format.h @@ -33,11 +33,13 @@ #define MEDIA_BUS_FMT_FIXED 0x0001 -/* RGB - next is 0x100e */ +/* RGB - next is 0x1010 */ +#define MEDIA_BUS_FMT_RGB444_1X12 0x100e #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE 0x1001 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE 0x1002 #define MEDIA_BUS_FMT_RGB555_2X8_PADHI_BE 0x1003 #define MEDIA_BUS_FMT_RGB555_2X8_PADHI_LE 0x1004 +#define MEDIA_BUS_FMT_RGB565_1X16 0x100f #define MEDIA_BUS_FMT_BGR565_2X8_BE 0x1005 #define MEDIA_BUS_FMT_BGR565_2X8_LE 0x1006 #define MEDIA_BUS_FMT_RGB565_2X8_BE 0x1007 -- cgit v1.2.3 From b295c22978b86fc62019d12f4108b68b7e795610 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 2 Dec 2014 17:49:04 +0100 Subject: Add LVDS RGB media bus formats This patch adds three new RGB media bus formats that describe 18-bit or 24-bit samples transferred over an LVDS bus with three or four differential data pairs, serialized into 7 time slots, using standard SPWG/PSWG/VESA or JEIDA data ordering. Signed-off-by: Philipp Zabel Acked-by: Sakari Ailus Acked-by: Hans Verkuil --- Documentation/DocBook/media/v4l/subdev-formats.xml | 255 +++++++++++++++++++++ include/uapi/linux/media-bus-format.h | 5 +- 2 files changed, 259 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/DocBook/media/v4l/subdev-formats.xml b/Documentation/DocBook/media/v4l/subdev-formats.xml index 29fe60112f4c..18449b32f240 100644 --- a/Documentation/DocBook/media/v4l/subdev-formats.xml +++ b/Documentation/DocBook/media/v4l/subdev-formats.xml @@ -622,6 +622,261 @@ see . + + On LVDS buses, usually each sample is transferred serialized in + seven time slots per pixel clock, on three (18-bit) or four (24-bit) + differential data pairs at the same time. The remaining bits are used for + control signals as defined by SPWG/PSWG/VESA or JEIDA standards. + The 24-bit RGB format serialized in seven time slots on four lanes using + JEIDA defined bit mapping will be named + MEDIA_BUS_FMT_RGB888_1X7X4_JEIDA, for example. + + + + LVDS RGB formats + + + + + + + + + + + + + Identifier + Code + + + Data organization + + + + + Timeslot + Lane + 3 + 2 + 1 + 0 + + + + + MEDIA_BUS_FMT_RGB666_1X7X3_SPWG + 0x1010 + 0 + + - + d + b1 + g0 + + + + + 1 + + - + d + b0 + r5 + + + + + 2 + + - + d + g5 + r4 + + + + + 3 + + - + b5 + g4 + r3 + + + + + 4 + + - + b4 + g3 + r2 + + + + + 5 + + - + b3 + g2 + r1 + + + + + 6 + + - + b2 + g1 + r0 + + + MEDIA_BUS_FMT_RGB888_1X7X4_SPWG + 0x1011 + 0 + + d + d + b1 + g0 + + + + + 1 + + b7 + d + b0 + r5 + + + + + 2 + + b6 + d + g5 + r4 + + + + + 3 + + g7 + b5 + g4 + r3 + + + + + 4 + + g6 + b4 + g3 + r2 + + + + + 5 + + r7 + b3 + g2 + r1 + + + + + 6 + + r6 + b2 + g1 + r0 + + + MEDIA_BUS_FMT_RGB888_1X7X4_JEIDA + 0x1012 + 0 + + d + d + b3 + g2 + + + + + 1 + + b1 + d + b2 + r7 + + + + + 2 + + b0 + d + g7 + r6 + + + + + 3 + + g1 + b7 + g6 + r5 + + + + + 4 + + g0 + b6 + g5 + r4 + + + + + 5 + + r1 + b5 + g4 + r3 + + + + + 6 + + r0 + b4 + g3 + r2 + + + +
diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h index 37091c668f65..3fb9cbbb603f 100644 --- a/include/uapi/linux/media-bus-format.h +++ b/include/uapi/linux/media-bus-format.h @@ -33,7 +33,7 @@ #define MEDIA_BUS_FMT_FIXED 0x0001 -/* RGB - next is 0x1010 */ +/* RGB - next is 0x1013 */ #define MEDIA_BUS_FMT_RGB444_1X12 0x100e #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE 0x1001 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE 0x1002 @@ -45,9 +45,12 @@ #define MEDIA_BUS_FMT_RGB565_2X8_BE 0x1007 #define MEDIA_BUS_FMT_RGB565_2X8_LE 0x1008 #define MEDIA_BUS_FMT_RGB666_1X18 0x1009 +#define MEDIA_BUS_FMT_RGB666_1X7X3_SPWG 0x1010 #define MEDIA_BUS_FMT_RGB888_1X24 0x100a #define MEDIA_BUS_FMT_RGB888_2X12_BE 0x100b #define MEDIA_BUS_FMT_RGB888_2X12_LE 0x100c +#define MEDIA_BUS_FMT_RGB888_1X7X4_SPWG 0x1011 +#define MEDIA_BUS_FMT_RGB888_1X7X4_JEIDA 0x1012 #define MEDIA_BUS_FMT_ARGB8888_1X32 0x100d /* YUV (including grey) - next is 0x2024 */ -- cgit v1.2.3 From 08c38458be7efa36a1d2dffd40500448e46d29c5 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 2 Dec 2014 15:45:25 +0100 Subject: Add BGR888_1X24 and GBR888_1X24 media bus formats This patch adds two more 24-bit RGB formats. BGR888 is more or less common, GBR888 is used on the internal connection between the IPU display interface and the TVE (VGA DAC) on i.MX53 SoCs. Signed-off-by: Philipp Zabel Acked-by: Laurent Pinchart Acked-by: Hans Verkuil --- Documentation/DocBook/media/v4l/subdev-formats.xml | 60 ++++++++++++++++++++++ include/uapi/linux/media-bus-format.h | 4 +- 2 files changed, 63 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/DocBook/media/v4l/subdev-formats.xml b/Documentation/DocBook/media/v4l/subdev-formats.xml index 18449b32f240..805cbe1acab7 100644 --- a/Documentation/DocBook/media/v4l/subdev-formats.xml +++ b/Documentation/DocBook/media/v4l/subdev-formats.xml @@ -480,6 +480,66 @@ see . b1 b0 + + MEDIA_BUS_FMT_BGR888_1X24 + 0x1013 + + &dash-ent-8; + b7 + b6 + b5 + b4 + b3 + b2 + b1 + b0 + g7 + g6 + g5 + g4 + g3 + g2 + g1 + g0 + r7 + r6 + r5 + r4 + r3 + r2 + r1 + r0 + + + MEDIA_BUS_FMT_GBR888_1X24 + 0x1014 + + &dash-ent-8; + g7 + g6 + g5 + g4 + g3 + g2 + g1 + g0 + b7 + b6 + b5 + b4 + b3 + b2 + b1 + b0 + r7 + r6 + r5 + r4 + r3 + r2 + r1 + r0 + MEDIA_BUS_FMT_RGB888_1X24 0x100a diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h index 3fb9cbbb603f..6f6942e8dae6 100644 --- a/include/uapi/linux/media-bus-format.h +++ b/include/uapi/linux/media-bus-format.h @@ -33,7 +33,7 @@ #define MEDIA_BUS_FMT_FIXED 0x0001 -/* RGB - next is 0x1013 */ +/* RGB - next is 0x1015 */ #define MEDIA_BUS_FMT_RGB444_1X12 0x100e #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE 0x1001 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE 0x1002 @@ -46,6 +46,8 @@ #define MEDIA_BUS_FMT_RGB565_2X8_LE 0x1008 #define MEDIA_BUS_FMT_RGB666_1X18 0x1009 #define MEDIA_BUS_FMT_RGB666_1X7X3_SPWG 0x1010 +#define MEDIA_BUS_FMT_BGR888_1X24 0x1013 +#define MEDIA_BUS_FMT_GBR888_1X24 0x1014 #define MEDIA_BUS_FMT_RGB888_1X24 0x100a #define MEDIA_BUS_FMT_RGB888_2X12_BE 0x100b #define MEDIA_BUS_FMT_RGB888_2X12_LE 0x100c -- cgit v1.2.3 From 0fc63eb104d76e20654cd97eeb1bfd0f35bc3d3a Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Wed, 3 Dec 2014 11:01:54 +0100 Subject: Add YUV8_1X24 media bus format This patch adds the media bus format for a 24-bit bus format with three 8-bit YUV components. Signed-off-by: Philipp Zabel Acked-by: Laurent Pinchart Acked-by: Hans Verkuil --- Documentation/DocBook/media/v4l/subdev-formats.xml | 37 ++++++++++++++++++++++ include/uapi/linux/media-bus-format.h | 3 +- 2 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/DocBook/media/v4l/subdev-formats.xml b/Documentation/DocBook/media/v4l/subdev-formats.xml index 805cbe1acab7..8d1f62402f88 100644 --- a/Documentation/DocBook/media/v4l/subdev-formats.xml +++ b/Documentation/DocBook/media/v4l/subdev-formats.xml @@ -3015,6 +3015,43 @@ see . u1 u0 + + MEDIA_BUS_FMT_YUV8_1X24 + 0x2024 + + - + - + - + - + - + - + - + - + y7 + y6 + y5 + y4 + y3 + y2 + y1 + y0 + u7 + u6 + u5 + u4 + u3 + u2 + u1 + u0 + v7 + v6 + v5 + v4 + v3 + v2 + v1 + v0 + MEDIA_BUS_FMT_YUV10_1X30 0x2016 diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h index 6f6942e8dae6..8dbf16cc5d1c 100644 --- a/include/uapi/linux/media-bus-format.h +++ b/include/uapi/linux/media-bus-format.h @@ -55,7 +55,7 @@ #define MEDIA_BUS_FMT_RGB888_1X7X4_JEIDA 0x1012 #define MEDIA_BUS_FMT_ARGB8888_1X32 0x100d -/* YUV (including grey) - next is 0x2024 */ +/* YUV (including grey) - next is 0x2025 */ #define MEDIA_BUS_FMT_Y8_1X8 0x2001 #define MEDIA_BUS_FMT_UV8_1X8 0x2015 #define MEDIA_BUS_FMT_UYVY8_1_5X8 0x2002 @@ -81,6 +81,7 @@ #define MEDIA_BUS_FMT_VYUY10_1X20 0x201b #define MEDIA_BUS_FMT_YUYV10_1X20 0x200d #define MEDIA_BUS_FMT_YVYU10_1X20 0x200e +#define MEDIA_BUS_FMT_YUV8_1X24 0x2024 #define MEDIA_BUS_FMT_YUV10_1X30 0x2016 #define MEDIA_BUS_FMT_AYUV8_1X32 0x2017 #define MEDIA_BUS_FMT_UYVY12_2X12 0x201c -- cgit v1.2.3 From 203508ef52e3fee93b71262928541ecea82c735d Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 2 Dec 2014 17:56:04 +0100 Subject: Add RGB666_1X24_CPADHI media bus format Commit 9e74d2926a28 ("staging: imx-drm: add LVDS666 support for parallel display") describes a 24-bit bus format where three 6-bit components each take the lower part of 8 bits with the two high bits zero padded. Add a component-wise padded media bus format RGB666_1X24_CPADHI to support this connection. Signed-off-by: Philipp Zabel Acked-by: Hans Verkuil Tested-by: Emil Renner Berthing --- Documentation/DocBook/media/v4l/subdev-formats.xml | 34 +++++++++++++++++++++- include/uapi/linux/media-bus-format.h | 3 +- 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/DocBook/media/v4l/subdev-formats.xml b/Documentation/DocBook/media/v4l/subdev-formats.xml index 8d1f62402f88..18b71aff48c9 100644 --- a/Documentation/DocBook/media/v4l/subdev-formats.xml +++ b/Documentation/DocBook/media/v4l/subdev-formats.xml @@ -91,7 +91,9 @@ see . For formats where the total number of bits per pixel is smaller than the number of bus samples per pixel times the bus width, a padding value stating if the bytes are padded in their most high order bits - (PADHI) or low order bits (PADLO). + (PADHI) or low order bits (PADLO). A "C" prefix is used for component-wise + padding in the most high order bits (CPADHI) or low order bits (CPADLO) + of each separate component. For formats where the number of bus samples per pixel is larger than 1, an endianness value stating if the pixel is transferred MSB first (BE) or LSB first (LE). @@ -480,6 +482,36 @@ see . b1 b0 + + MEDIA_BUS_FMT_RGB666_1X24_CPADHI + 0x1015 + + &dash-ent-8; + 0 + 0 + r5 + r4 + r3 + r2 + r1 + r0 + 0 + 0 + g5 + g4 + g3 + g2 + g1 + g0 + 0 + 0 + b5 + b4 + b3 + b2 + b1 + b0 + MEDIA_BUS_FMT_BGR888_1X24 0x1013 diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h index 8dbf16cc5d1c..83ea46f4be51 100644 --- a/include/uapi/linux/media-bus-format.h +++ b/include/uapi/linux/media-bus-format.h @@ -33,7 +33,7 @@ #define MEDIA_BUS_FMT_FIXED 0x0001 -/* RGB - next is 0x1015 */ +/* RGB - next is 0x1016 */ #define MEDIA_BUS_FMT_RGB444_1X12 0x100e #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE 0x1001 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE 0x1002 @@ -45,6 +45,7 @@ #define MEDIA_BUS_FMT_RGB565_2X8_BE 0x1007 #define MEDIA_BUS_FMT_RGB565_2X8_LE 0x1008 #define MEDIA_BUS_FMT_RGB666_1X18 0x1009 +#define MEDIA_BUS_FMT_RGB666_1X24_CPADHI 0x1015 #define MEDIA_BUS_FMT_RGB666_1X7X3_SPWG 0x1010 #define MEDIA_BUS_FMT_BGR888_1X24 0x1013 #define MEDIA_BUS_FMT_GBR888_1X24 0x1014 -- cgit v1.2.3 From 15e318bdc6dfb82914c82fb7ad00badaa8387d8e Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Sun, 29 Mar 2015 16:59:24 +0200 Subject: xfrm: simplify xfrm_address_t use In many places, the a6 field is typecasted to struct in6_addr. As the fields are in union anyway, just add in6_addr type to the union and get rid of the typecasting. Modifying the uapi header is okay, the union has still the same size. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/xfrm.h | 6 +++--- include/uapi/linux/xfrm.h | 2 ++ net/ipv6/xfrm6_mode_beet.c | 4 ++-- net/ipv6/xfrm6_policy.c | 4 +--- net/key/af_key.c | 2 +- net/xfrm/xfrm_state.c | 8 ++++---- 6 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d0ac7d7be8a7..461f83539493 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1025,7 +1025,7 @@ xfrm_addr_any(const xfrm_address_t *addr, unsigned short family) case AF_INET: return addr->a4 == 0; case AF_INET6: - return ipv6_addr_any((struct in6_addr *)&addr->a6); + return ipv6_addr_any(&addr->in6); } return 0; } @@ -1238,8 +1238,8 @@ void xfrm_flowi_addr_get(const struct flowi *fl, memcpy(&daddr->a4, &fl->u.ip4.daddr, sizeof(daddr->a4)); break; case AF_INET6: - *(struct in6_addr *)saddr->a6 = fl->u.ip6.saddr; - *(struct in6_addr *)daddr->a6 = fl->u.ip6.daddr; + saddr->in6 = fl->u.ip6.saddr; + daddr->in6 = fl->u.ip6.daddr; break; } } diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 02d5125a5ee8..2cd9e608d0d1 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -1,6 +1,7 @@ #ifndef _LINUX_XFRM_H #define _LINUX_XFRM_H +#include #include /* All of the structures in this file may not change size as they are @@ -13,6 +14,7 @@ typedef union { __be32 a4; __be32 a6[4]; + struct in6_addr in6; } xfrm_address_t; /* Ident of a specific xfrm_state. It is used on input to lookup diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c index 9949a356d62c..1e205c3253ac 100644 --- a/net/ipv6/xfrm6_mode_beet.c +++ b/net/ipv6/xfrm6_mode_beet.c @@ -95,8 +95,8 @@ static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(skb->len - size); - ip6h->daddr = *(struct in6_addr *)&x->sel.daddr.a6; - ip6h->saddr = *(struct in6_addr *)&x->sel.saddr.a6; + ip6h->daddr = x->sel.daddr.in6; + ip6h->saddr = x->sel.saddr.in6; err = 0; out: return err; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 91d934c22a2a..f337a908a76a 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -61,9 +61,7 @@ static int xfrm6_get_saddr(struct net *net, return -EHOSTUNREACH; dev = ip6_dst_idev(dst)->dev; - ipv6_dev_get_saddr(dev_net(dev), dev, - (struct in6_addr *)&daddr->a6, 0, - (struct in6_addr *)&saddr->a6); + ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6); dst_release(dst); return 0; } diff --git a/net/key/af_key.c b/net/key/af_key.c index 9255fd9d94bc..f0d52d721b3a 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -709,7 +709,7 @@ static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port sin6->sin6_family = AF_INET6; sin6->sin6_port = port; sin6->sin6_flowinfo = 0; - sin6->sin6_addr = *(struct in6_addr *)xaddr->a6; + sin6->sin6_addr = xaddr->in6; sin6->sin6_scope_id = 0; return 128; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index de971b6d38c5..f5e39e35d73a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1043,12 +1043,12 @@ static struct xfrm_state *__find_acq_core(struct net *net, break; case AF_INET6: - *(struct in6_addr *)x->sel.daddr.a6 = *(struct in6_addr *)daddr; - *(struct in6_addr *)x->sel.saddr.a6 = *(struct in6_addr *)saddr; + x->sel.daddr.in6 = daddr->in6; + x->sel.saddr.in6 = saddr->in6; x->sel.prefixlen_d = 128; x->sel.prefixlen_s = 128; - *(struct in6_addr *)x->props.saddr.a6 = *(struct in6_addr *)saddr; - *(struct in6_addr *)x->id.daddr.a6 = *(struct in6_addr *)daddr; + x->props.saddr.in6 = saddr->in6; + x->id.daddr.in6 = daddr->in6; break; } -- cgit v1.2.3 From 47b43c52ee4b0425449d1b2b1eedca7f6b7a578a Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Tue, 11 Nov 2014 20:57:06 +0100 Subject: KVM: s390: add ioctl to inject local interrupts We have introduced struct kvm_s390_irq a while ago which allows to inject all kinds of interrupts as defined in the Principles of Operation. Add ioctl to inject interrupts with the extended struct kvm_s390_irq Signed-off-by: Jens Freimann Signed-off-by: Christian Borntraeger Acked-by: Cornelia Huck --- Documentation/virtual/kvm/api.txt | 56 +++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 10 +++++++ include/uapi/linux/kvm.h | 3 +++ virt/kvm/kvm_main.c | 2 +- 4 files changed, 70 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 0d7fc66289a0..a7c651d0dc63 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2820,6 +2820,62 @@ single frame starting at start_gfn for count frames. Note: If any architecturally invalid key value is found in the given data then the ioctl will return -EINVAL. +4.92 KVM_S390_IRQ + +Capability: KVM_CAP_S390_INJECT_IRQ +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_irq (in) +Returns: 0 on success, -1 on error +Errors: + EINVAL: interrupt type is invalid + type is KVM_S390_SIGP_STOP and flag parameter is invalid value + type is KVM_S390_INT_EXTERNAL_CALL and code is bigger + than the maximum of VCPUs + EBUSY: type is KVM_S390_SIGP_SET_PREFIX and vcpu is not stopped + type is KVM_S390_SIGP_STOP and a stop irq is already pending + type is KVM_S390_INT_EXTERNAL_CALL and an external call interrupt + is already pending + +Allows to inject an interrupt to the guest. + +Using struct kvm_s390_irq as a parameter allows +to inject additional payload which is not +possible via KVM_S390_INTERRUPT. + +Interrupt parameters are passed via kvm_s390_irq: + +struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; +}; + +type can be one of the following: + +KVM_S390_SIGP_STOP - sigp stop; parameter in .stop +KVM_S390_PROGRAM_INT - program check; parameters in .pgm +KVM_S390_SIGP_SET_PREFIX - sigp set prefix; parameters in .prefix +KVM_S390_RESTART - restart; no parameters +KVM_S390_INT_CLOCK_COMP - clock comparator interrupt; no parameters +KVM_S390_INT_CPU_TIMER - CPU timer interrupt; no parameters +KVM_S390_INT_EMERGENCY - sigp emergency; parameters in .emerg +KVM_S390_INT_EXTERNAL_CALL - sigp external call; parameters in .extcall +KVM_S390_MCHK - machine check interrupt; parameters in .mchk + + +Note that the vcpu ioctl is asynchronous to vcpu execution. + + 5. The kvm_run structure ------------------------ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index dbc9ca34d9da..8bc25d405edf 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -177,6 +177,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: + case KVM_CAP_S390_INJECT_IRQ: case KVM_CAP_S390_USER_SIGP: case KVM_CAP_S390_USER_STSI: case KVM_CAP_S390_SKEYS: @@ -2391,6 +2392,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, long r; switch (ioctl) { + case KVM_S390_IRQ: { + struct kvm_s390_irq s390irq; + + r = -EFAULT; + if (copy_from_user(&s390irq, argp, sizeof(s390irq))) + break; + r = kvm_s390_inject_vcpu(vcpu, &s390irq); + break; + } case KVM_S390_INTERRUPT: { struct kvm_s390_interrupt s390int; struct kvm_s390_irq s390irq; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 1162ef7a3fa1..c0632e87a00f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -802,6 +802,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_MEM_OP 108 #define KVM_CAP_S390_USER_STSI 109 #define KVM_CAP_S390_SKEYS 110 +#define KVM_CAP_S390_INJECT_IRQ 113 #ifdef KVM_CAP_IRQ_ROUTING @@ -1182,6 +1183,8 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_S390_SKEYS */ #define KVM_S390_GET_SKEYS _IOW(KVMIO, 0xb2, struct kvm_s390_skeys) #define KVM_S390_SET_SKEYS _IOW(KVMIO, 0xb3, struct kvm_s390_skeys) +/* Available with KVM_CAP_S390_INJECT_IRQ */ +#define KVM_S390_IRQ _IOW(KVMIO, 0xb4, struct kvm_s390_irq) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a1093700f3a4..34310a8d24b9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2118,7 +2118,7 @@ static long kvm_vcpu_ioctl(struct file *filp, * Special cases: vcpu ioctls that are asynchronous to vcpu execution, * so vcpu_load() would break it. */ - if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) + if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT) return kvm_arch_vcpu_ioctl(filp, ioctl, arg); #endif -- cgit v1.2.3 From 816c7667ea97c61884e014cfeedaede5b67b0e58 Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Mon, 24 Nov 2014 17:13:46 +0100 Subject: KVM: s390: migrate vcpu interrupt state This patch adds support to migrate vcpu interrupts. Two new vcpu ioctls are added which get/set the complete status of pending interrupts in one go. The ioctls are marked as available with the new capability KVM_CAP_S390_IRQ_STATE. We can not use a ONEREG, as the number of pending local interrupts is not constant and depends on the number of CPUs. To retrieve the interrupt state we add an ioctl KVM_S390_GET_IRQ_STATE. Its input parameter is a pointer to a struct kvm_s390_irq_state which has a buffer and length. For all currently pending interrupts, we copy a struct kvm_s390_irq into the buffer and pass it to userspace. To store interrupt state into a buffer provided by userspace, we add an ioctl KVM_S390_SET_IRQ_STATE. It passes a struct kvm_s390_irq_state into the kernel and injects all interrupts contained in the buffer. Signed-off-by: Jens Freimann Signed-off-by: Christian Borntraeger Acked-by: Cornelia Huck --- Documentation/virtual/kvm/api.txt | 61 +++++++++++++++++ arch/s390/kvm/interrupt.c | 140 ++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 36 ++++++++++ arch/s390/kvm/kvm-s390.h | 4 ++ include/uapi/linux/kvm.h | 11 +++ 5 files changed, 252 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a7c651d0dc63..18fb7630e2ad 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2875,6 +2875,67 @@ KVM_S390_MCHK - machine check interrupt; parameters in .mchk Note that the vcpu ioctl is asynchronous to vcpu execution. +4.94 KVM_S390_GET_IRQ_STATE + +Capability: KVM_CAP_S390_IRQ_STATE +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_irq_state (out) +Returns: >= number of bytes copied into buffer, + -EINVAL if buffer size is 0, + -ENOBUFS if buffer size is too small to fit all pending interrupts, + -EFAULT if the buffer address was invalid + +This ioctl allows userspace to retrieve the complete state of all currently +pending interrupts in a single buffer. Use cases include migration +and introspection. The parameter structure contains the address of a +userspace buffer and its length: + +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; + __u32 len; + __u32 reserved[4]; +}; + +Userspace passes in the above struct and for each pending interrupt a +struct kvm_s390_irq is copied to the provided buffer. + +If -ENOBUFS is returned the buffer provided was too small and userspace +may retry with a bigger buffer. + +4.95 KVM_S390_SET_IRQ_STATE + +Capability: KVM_CAP_S390_IRQ_STATE +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_irq_state (in) +Returns: 0 on success, + -EFAULT if the buffer address was invalid, + -EINVAL for an invalid buffer length (see below), + -EBUSY if there were already interrupts pending, + errors occurring when actually injecting the + interrupt. See KVM_S390_IRQ. + +This ioctl allows userspace to set the complete state of all cpu-local +interrupts currently pending for the vcpu. It is intended for restoring +interrupt state after a migration. The input parameter is a userspace buffer +containing a struct kvm_s390_irq_state: + +struct kvm_s390_irq_state { + __u64 buf; + __u32 len; + __u32 pad; +}; + +The userspace memory referenced by buf contains a struct kvm_s390_irq +for each interrupt to be injected into the guest. +If one of the interrupts could not be injected for some reason the +ioctl aborts. + +len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0 +and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq), +which is the maximum number of possibly pending cpu-local interrupts. 5. The kvm_run structure ------------------------ diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index bc0988093c5b..9de47265ef73 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2123,3 +2123,143 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, { return -EINVAL; } + +int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu, void __user *irqstate, int len) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_irq *buf; + int r = 0; + int n; + + buf = vmalloc(len); + if (!buf) + return -ENOMEM; + + if (copy_from_user((void *) buf, irqstate, len)) { + r = -EFAULT; + goto out_free; + } + + /* + * Don't allow setting the interrupt state + * when there are already interrupts pending + */ + spin_lock(&li->lock); + if (li->pending_irqs) { + r = -EBUSY; + goto out_unlock; + } + + for (n = 0; n < len / sizeof(*buf); n++) { + r = do_inject_vcpu(vcpu, &buf[n]); + if (r) + break; + } + +out_unlock: + spin_unlock(&li->lock); +out_free: + vfree(buf); + + return r; +} + +static void store_local_irq(struct kvm_s390_local_interrupt *li, + struct kvm_s390_irq *irq, + unsigned long irq_type) +{ + switch (irq_type) { + case IRQ_PEND_MCHK_EX: + case IRQ_PEND_MCHK_REP: + irq->type = KVM_S390_MCHK; + irq->u.mchk = li->irq.mchk; + break; + case IRQ_PEND_PROG: + irq->type = KVM_S390_PROGRAM_INT; + irq->u.pgm = li->irq.pgm; + break; + case IRQ_PEND_PFAULT_INIT: + irq->type = KVM_S390_INT_PFAULT_INIT; + irq->u.ext = li->irq.ext; + break; + case IRQ_PEND_EXT_EXTERNAL: + irq->type = KVM_S390_INT_EXTERNAL_CALL; + irq->u.extcall = li->irq.extcall; + break; + case IRQ_PEND_EXT_CLOCK_COMP: + irq->type = KVM_S390_INT_CLOCK_COMP; + break; + case IRQ_PEND_EXT_CPU_TIMER: + irq->type = KVM_S390_INT_CPU_TIMER; + break; + case IRQ_PEND_SIGP_STOP: + irq->type = KVM_S390_SIGP_STOP; + irq->u.stop = li->irq.stop; + break; + case IRQ_PEND_RESTART: + irq->type = KVM_S390_RESTART; + break; + case IRQ_PEND_SET_PREFIX: + irq->type = KVM_S390_SIGP_SET_PREFIX; + irq->u.prefix = li->irq.prefix; + break; + } +} + +int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) +{ + uint8_t sigp_ctrl = vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl; + unsigned long sigp_emerg_pending[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + unsigned long pending_irqs; + struct kvm_s390_irq irq; + unsigned long irq_type; + int cpuaddr; + int n = 0; + + spin_lock(&li->lock); + pending_irqs = li->pending_irqs; + memcpy(&sigp_emerg_pending, &li->sigp_emerg_pending, + sizeof(sigp_emerg_pending)); + spin_unlock(&li->lock); + + for_each_set_bit(irq_type, &pending_irqs, IRQ_PEND_COUNT) { + memset(&irq, 0, sizeof(irq)); + if (irq_type == IRQ_PEND_EXT_EMERGENCY) + continue; + if (n + sizeof(irq) > len) + return -ENOBUFS; + store_local_irq(&vcpu->arch.local_int, &irq, irq_type); + if (copy_to_user(&buf[n], &irq, sizeof(irq))) + return -EFAULT; + n += sizeof(irq); + } + + if (test_bit(IRQ_PEND_EXT_EMERGENCY, &pending_irqs)) { + for_each_set_bit(cpuaddr, sigp_emerg_pending, KVM_MAX_VCPUS) { + memset(&irq, 0, sizeof(irq)); + if (n + sizeof(irq) > len) + return -ENOBUFS; + irq.type = KVM_S390_INT_EMERGENCY; + irq.u.emerg.code = cpuaddr; + if (copy_to_user(&buf[n], &irq, sizeof(irq))) + return -EFAULT; + n += sizeof(irq); + } + } + + if ((sigp_ctrl & SIGP_CTRL_C) && + (atomic_read(&vcpu->arch.sie_block->cpuflags) & + CPUSTAT_ECALL_PEND)) { + if (n + sizeof(irq) > len) + return -ENOBUFS; + memset(&irq, 0, sizeof(irq)); + irq.type = KVM_S390_INT_EXTERNAL_CALL; + irq.u.extcall.code = sigp_ctrl & SIGP_CTRL_SCN_MASK; + if (copy_to_user(&buf[n], &irq, sizeof(irq))) + return -EFAULT; + n += sizeof(irq); + } + + return n; +} diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8bc25d405edf..3040b14751b8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -41,6 +41,9 @@ #include "trace-s390.h" #define MEM_OP_MAX_SIZE 65536 /* Maximum transfer size for KVM_S390_MEM_OP */ +#define LOCAL_IRQS 32 +#define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \ + (KVM_MAX_VCPUS + LOCAL_IRQS)) #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU @@ -181,6 +184,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_USER_SIGP: case KVM_CAP_S390_USER_STSI: case KVM_CAP_S390_SKEYS: + case KVM_CAP_S390_IRQ_STATE: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -2500,6 +2504,38 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EFAULT; break; } + case KVM_S390_SET_IRQ_STATE: { + struct kvm_s390_irq_state irq_state; + + r = -EFAULT; + if (copy_from_user(&irq_state, argp, sizeof(irq_state))) + break; + if (irq_state.len > VCPU_IRQS_MAX_BUF || + irq_state.len == 0 || + irq_state.len % sizeof(struct kvm_s390_irq) > 0) { + r = -EINVAL; + break; + } + r = kvm_s390_set_irq_state(vcpu, + (void __user *) irq_state.buf, + irq_state.len); + break; + } + case KVM_S390_GET_IRQ_STATE: { + struct kvm_s390_irq_state irq_state; + + r = -EFAULT; + if (copy_from_user(&irq_state, argp, sizeof(irq_state))) + break; + if (irq_state.len == 0) { + r = -EINVAL; + break; + } + r = kvm_s390_get_irq_state(vcpu, + (__u8 __user *) irq_state.buf, + irq_state.len); + break; + } default: r = -ENOTTY; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 343644a59392..ca108b90ae56 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -272,6 +272,10 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu); extern struct kvm_device_ops kvm_flic_ops; int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu); void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu); +int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu, + void __user *buf, int len); +int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, + __u8 __user *buf, int len); /* implemented in guestdbg.c */ void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c0632e87a00f..c045c725e521 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -558,6 +558,13 @@ struct kvm_s390_irq { } u; }; +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; + __u32 len; + __u32 reserved[4]; +}; + /* for KVM_SET_GUEST_DEBUG */ #define KVM_GUESTDBG_ENABLE 0x00000001 @@ -803,6 +810,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_STSI 109 #define KVM_CAP_S390_SKEYS 110 #define KVM_CAP_S390_INJECT_IRQ 113 +#define KVM_CAP_S390_IRQ_STATE 114 #ifdef KVM_CAP_IRQ_ROUTING @@ -1185,6 +1193,9 @@ struct kvm_s390_ucas_mapping { #define KVM_S390_SET_SKEYS _IOW(KVMIO, 0xb3, struct kvm_s390_skeys) /* Available with KVM_CAP_S390_INJECT_IRQ */ #define KVM_S390_IRQ _IOW(KVMIO, 0xb4, struct kvm_s390_irq) +/* Available with KVM_CAP_S390_IRQ_STATE */ +#define KVM_S390_SET_IRQ_STATE _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state) +#define KVM_S390_GET_IRQ_STATE _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -- cgit v1.2.3 From 7f62fe8a5851db94e10d8d956c123d4011aaeed9 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Wed, 2 Jun 2010 22:23:34 +0300 Subject: HSI: cmt_speech: Add cmt-speech driver Introduces the cmt-speech driver, which implements a character device interface for transferring speech data frames over HSI/SSI. The driver is used to exchange voice/speech data between the Nokia N900/N950/N9's modem and its cpu. Signed-off-by: Kai Vehmanen Signed-off-by: Carlos Chinea Signed-off-by: Joni Lapilainen Since the original driver has been written for 2.6.28 some build fixes and general cleanups have been added by me: * fix build for 4.0 kernel * replace GFP_ATOMIC with GFP_KERNEL in cs_alloc_cmds() * add sanity check for CS_SET_WAKELINE ioctl * cleanup driver initialisation * rename driver to cmt-speech to be consistent with ssi-protocol driver * move cs-protocol.h to include/uapi/linux/hsi, since it describes a userspace API * replace hardcoded channels numbers with values provided via the HSI framework (e.g. coming from DT) Acked-by: Aaro Koskinen Tested-by: Pavel Machek Signed-off-by: Sebastian Reichel --- drivers/hsi/clients/Kconfig | 10 + drivers/hsi/clients/Makefile | 1 + drivers/hsi/clients/cmt_speech.c | 1456 ++++++++++++++++++++++++++++++++++ include/uapi/linux/hsi/Kbuild | 2 +- include/uapi/linux/hsi/cs-protocol.h | 113 +++ 5 files changed, 1581 insertions(+), 1 deletion(-) create mode 100644 drivers/hsi/clients/cmt_speech.c create mode 100644 include/uapi/linux/hsi/cs-protocol.h (limited to 'include/uapi') diff --git a/drivers/hsi/clients/Kconfig b/drivers/hsi/clients/Kconfig index bc60dec3f586..86c849506f34 100644 --- a/drivers/hsi/clients/Kconfig +++ b/drivers/hsi/clients/Kconfig @@ -13,6 +13,16 @@ config NOKIA_MODEM If unsure, say N. +config CMT_SPEECH + tristate "CMT speech" + depends on HSI && SSI_PROTOCOL + help + If you say Y here, you will enable the CMT speech protocol used + by Nokia modems. If you say M the protocol will be available as + module named cmt_speech. + + If unsure, say N. + config SSI_PROTOCOL tristate "SSI protocol" depends on HSI && PHONET && OMAP_SSI diff --git a/drivers/hsi/clients/Makefile b/drivers/hsi/clients/Makefile index 4d5bc0e0b27b..260723266407 100644 --- a/drivers/hsi/clients/Makefile +++ b/drivers/hsi/clients/Makefile @@ -4,4 +4,5 @@ obj-$(CONFIG_NOKIA_MODEM) += nokia-modem.o obj-$(CONFIG_SSI_PROTOCOL) += ssi_protocol.o +obj-$(CONFIG_CMT_SPEECH) += cmt_speech.o obj-$(CONFIG_HSI_CHAR) += hsi_char.o diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c new file mode 100644 index 000000000000..e9560ef23092 --- /dev/null +++ b/drivers/hsi/clients/cmt_speech.c @@ -0,0 +1,1456 @@ +/* + * cmt_speech.c - HSI CMT speech driver + * + * Copyright (C) 2008,2009,2010 Nokia Corporation. All rights reserved. + * + * Contact: Kai Vehmanen + * Original author: Peter Ujfalusi + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CS_MMAP_SIZE PAGE_SIZE + +struct char_queue { + struct list_head list; + u32 msg; +}; + +struct cs_char { + unsigned int opened; + struct hsi_client *cl; + struct cs_hsi_iface *hi; + struct list_head chardev_queue; + struct list_head dataind_queue; + int dataind_pending; + /* mmap things */ + unsigned long mmap_base; + unsigned long mmap_size; + spinlock_t lock; + struct fasync_struct *async_queue; + wait_queue_head_t wait; + /* hsi channel ids */ + int channel_id_cmd; + int channel_id_data; +}; + +#define SSI_CHANNEL_STATE_READING 1 +#define SSI_CHANNEL_STATE_WRITING (1 << 1) +#define SSI_CHANNEL_STATE_POLL (1 << 2) +#define SSI_CHANNEL_STATE_ERROR (1 << 3) + +#define TARGET_MASK 0xf000000 +#define TARGET_REMOTE (1 << CS_DOMAIN_SHIFT) +#define TARGET_LOCAL 0 + +/* Number of pre-allocated commands buffers */ +#define CS_MAX_CMDS 4 + +/* + * During data transfers, transactions must be handled + * within 20ms (fixed value in cmtspeech HSI protocol) + */ +#define CS_QOS_LATENCY_FOR_DATA_USEC 20000 + +/* Timeout to wait for pending HSI transfers to complete */ +#define CS_HSI_TRANSFER_TIMEOUT_MS 500 + + +#define RX_PTR_BOUNDARY_SHIFT 8 +#define RX_PTR_MAX_SHIFT (RX_PTR_BOUNDARY_SHIFT + \ + CS_MAX_BUFFERS_SHIFT) +struct cs_hsi_iface { + struct hsi_client *cl; + struct hsi_client *master; + + unsigned int iface_state; + unsigned int wakeline_state; + unsigned int control_state; + unsigned int data_state; + + /* state exposed to application */ + struct cs_mmap_config_block *mmap_cfg; + + unsigned long mmap_base; + unsigned long mmap_size; + + unsigned int rx_slot; + unsigned int tx_slot; + + /* note: for security reasons, we do not trust the contents of + * mmap_cfg, but instead duplicate the variables here */ + unsigned int buf_size; + unsigned int rx_bufs; + unsigned int tx_bufs; + unsigned int rx_ptr_boundary; + unsigned int rx_offsets[CS_MAX_BUFFERS]; + unsigned int tx_offsets[CS_MAX_BUFFERS]; + + /* size of aligned memory blocks */ + unsigned int slot_size; + unsigned int flags; + + struct list_head cmdqueue; + + struct hsi_msg *data_rx_msg; + struct hsi_msg *data_tx_msg; + wait_queue_head_t datawait; + + struct pm_qos_request pm_qos_req; + + spinlock_t lock; +}; + +static struct cs_char cs_char_data; + +static void cs_hsi_read_on_control(struct cs_hsi_iface *hi); +static void cs_hsi_read_on_data(struct cs_hsi_iface *hi); + +static inline void rx_ptr_shift_too_big(void) +{ + BUILD_BUG_ON((1LLU << RX_PTR_MAX_SHIFT) > UINT_MAX); +} + +static void cs_notify(u32 message, struct list_head *head) +{ + struct char_queue *entry; + + spin_lock(&cs_char_data.lock); + + if (!cs_char_data.opened) { + spin_unlock(&cs_char_data.lock); + goto out; + } + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) { + dev_err(&cs_char_data.cl->device, + "Can't allocate new entry for the queue.\n"); + spin_unlock(&cs_char_data.lock); + goto out; + } + + entry->msg = message; + list_add_tail(&entry->list, head); + + spin_unlock(&cs_char_data.lock); + + wake_up_interruptible(&cs_char_data.wait); + kill_fasync(&cs_char_data.async_queue, SIGIO, POLL_IN); + +out: + return; +} + +static u32 cs_pop_entry(struct list_head *head) +{ + struct char_queue *entry; + u32 data; + + entry = list_entry(head->next, struct char_queue, list); + data = entry->msg; + list_del(&entry->list); + kfree(entry); + + return data; +} + +static void cs_notify_control(u32 message) +{ + cs_notify(message, &cs_char_data.chardev_queue); +} + +static void cs_notify_data(u32 message, int maxlength) +{ + cs_notify(message, &cs_char_data.dataind_queue); + + spin_lock(&cs_char_data.lock); + cs_char_data.dataind_pending++; + while (cs_char_data.dataind_pending > maxlength && + !list_empty(&cs_char_data.dataind_queue)) { + dev_dbg(&cs_char_data.cl->device, "data notification " + "queue overrun (%u entries)\n", cs_char_data.dataind_pending); + + cs_pop_entry(&cs_char_data.dataind_queue); + cs_char_data.dataind_pending--; + } + spin_unlock(&cs_char_data.lock); +} + +static inline void cs_set_cmd(struct hsi_msg *msg, u32 cmd) +{ + u32 *data = sg_virt(msg->sgt.sgl); + *data = cmd; +} + +static inline u32 cs_get_cmd(struct hsi_msg *msg) +{ + u32 *data = sg_virt(msg->sgt.sgl); + return *data; +} + +static void cs_release_cmd(struct hsi_msg *msg) +{ + struct cs_hsi_iface *hi = msg->context; + + list_add_tail(&msg->link, &hi->cmdqueue); +} + +static void cs_cmd_destructor(struct hsi_msg *msg) +{ + struct cs_hsi_iface *hi = msg->context; + + spin_lock(&hi->lock); + + dev_dbg(&cs_char_data.cl->device, "control cmd destructor\n"); + + if (hi->iface_state != CS_STATE_CLOSED) + dev_err(&hi->cl->device, "Cmd flushed while driver active\n"); + + if (msg->ttype == HSI_MSG_READ) + hi->control_state &= + ~(SSI_CHANNEL_STATE_POLL | SSI_CHANNEL_STATE_READING); + else if (msg->ttype == HSI_MSG_WRITE && + hi->control_state & SSI_CHANNEL_STATE_WRITING) + hi->control_state &= ~SSI_CHANNEL_STATE_WRITING; + + cs_release_cmd(msg); + + spin_unlock(&hi->lock); +} + +static struct hsi_msg *cs_claim_cmd(struct cs_hsi_iface* ssi) +{ + struct hsi_msg *msg; + + BUG_ON(list_empty(&ssi->cmdqueue)); + + msg = list_first_entry(&ssi->cmdqueue, struct hsi_msg, link); + list_del(&msg->link); + msg->destructor = cs_cmd_destructor; + + return msg; +} + +static void cs_free_cmds(struct cs_hsi_iface *ssi) +{ + struct hsi_msg *msg, *tmp; + + list_for_each_entry_safe(msg, tmp, &ssi->cmdqueue, link) { + list_del(&msg->link); + msg->destructor = NULL; + kfree(sg_virt(msg->sgt.sgl)); + hsi_free_msg(msg); + } +} + +static int cs_alloc_cmds(struct cs_hsi_iface *hi) +{ + struct hsi_msg *msg; + u32 *buf; + unsigned int i; + + INIT_LIST_HEAD(&hi->cmdqueue); + + for (i = 0; i < CS_MAX_CMDS; i++) { + msg = hsi_alloc_msg(1, GFP_KERNEL); + if (!msg) + goto out; + buf = kmalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) { + hsi_free_msg(msg); + goto out; + } + sg_init_one(msg->sgt.sgl, buf, sizeof(*buf)); + msg->channel = cs_char_data.channel_id_cmd; + msg->context = hi; + list_add_tail(&msg->link, &hi->cmdqueue); + } + + return 0; + +out: + cs_free_cmds(hi); + return -ENOMEM; +} + +static void cs_hsi_data_destructor(struct hsi_msg *msg) +{ + struct cs_hsi_iface *hi = msg->context; + const char *dir = (msg->ttype == HSI_MSG_READ) ? "TX" : "RX"; + + dev_dbg(&cs_char_data.cl->device, "Freeing data %s message\n", dir); + + spin_lock(&hi->lock); + if (hi->iface_state != CS_STATE_CLOSED) + dev_err(&cs_char_data.cl->device, + "Data %s flush while device active\n", dir); + if (msg->ttype == HSI_MSG_READ) + hi->data_state &= + ~(SSI_CHANNEL_STATE_POLL | SSI_CHANNEL_STATE_READING); + else + hi->data_state &= ~SSI_CHANNEL_STATE_WRITING; + + msg->status = HSI_STATUS_COMPLETED; + if (unlikely(waitqueue_active(&hi->datawait))) + wake_up_interruptible(&hi->datawait); + + spin_unlock(&hi->lock); +} + +static int cs_hsi_alloc_data(struct cs_hsi_iface *hi) +{ + struct hsi_msg *txmsg, *rxmsg; + int res = 0; + + rxmsg = hsi_alloc_msg(1, GFP_KERNEL); + if (!rxmsg) { + res = -ENOMEM; + goto out1; + } + rxmsg->channel = cs_char_data.channel_id_data; + rxmsg->destructor = cs_hsi_data_destructor; + rxmsg->context = hi; + + txmsg = hsi_alloc_msg(1, GFP_KERNEL); + if (!txmsg) { + res = -ENOMEM; + goto out2; + } + txmsg->channel = cs_char_data.channel_id_data; + txmsg->destructor = cs_hsi_data_destructor; + txmsg->context = hi; + + hi->data_rx_msg = rxmsg; + hi->data_tx_msg = txmsg; + + return 0; + +out2: + hsi_free_msg(rxmsg); +out1: + return res; +} + +static void cs_hsi_free_data_msg(struct hsi_msg *msg) +{ + WARN_ON(msg->status != HSI_STATUS_COMPLETED && + msg->status != HSI_STATUS_ERROR); + hsi_free_msg(msg); +} + +static void cs_hsi_free_data(struct cs_hsi_iface *hi) +{ + cs_hsi_free_data_msg(hi->data_rx_msg); + cs_hsi_free_data_msg(hi->data_tx_msg); +} + +static inline void __cs_hsi_error_pre(struct cs_hsi_iface *hi, + struct hsi_msg *msg, const char *info, + unsigned int *state) +{ + spin_lock(&hi->lock); + dev_err(&hi->cl->device, "HSI %s error, msg %d, state %u\n", + info, msg->status, *state); +} + +static inline void __cs_hsi_error_post(struct cs_hsi_iface *hi) +{ + spin_unlock(&hi->lock); +} + +static inline void __cs_hsi_error_read_bits(unsigned int *state) +{ + *state |= SSI_CHANNEL_STATE_ERROR; + *state &= ~(SSI_CHANNEL_STATE_READING | SSI_CHANNEL_STATE_POLL); +} + +static inline void __cs_hsi_error_write_bits(unsigned int *state) +{ + *state |= SSI_CHANNEL_STATE_ERROR; + *state &= ~SSI_CHANNEL_STATE_WRITING; +} + +static void cs_hsi_control_read_error(struct cs_hsi_iface *hi, + struct hsi_msg *msg) +{ + __cs_hsi_error_pre(hi, msg, "control read", &hi->control_state); + cs_release_cmd(msg); + __cs_hsi_error_read_bits(&hi->control_state); + __cs_hsi_error_post(hi); +} + +static void cs_hsi_control_write_error(struct cs_hsi_iface *hi, + struct hsi_msg *msg) +{ + __cs_hsi_error_pre(hi, msg, "control write", &hi->control_state); + cs_release_cmd(msg); + __cs_hsi_error_write_bits(&hi->control_state); + __cs_hsi_error_post(hi); + +} + +static void cs_hsi_data_read_error(struct cs_hsi_iface *hi, struct hsi_msg *msg) +{ + __cs_hsi_error_pre(hi, msg, "data read", &hi->data_state); + __cs_hsi_error_read_bits(&hi->data_state); + __cs_hsi_error_post(hi); +} + +static void cs_hsi_data_write_error(struct cs_hsi_iface *hi, + struct hsi_msg *msg) +{ + __cs_hsi_error_pre(hi, msg, "data write", &hi->data_state); + __cs_hsi_error_write_bits(&hi->data_state); + __cs_hsi_error_post(hi); +} + +static void cs_hsi_read_on_control_complete(struct hsi_msg *msg) +{ + u32 cmd = cs_get_cmd(msg); + struct cs_hsi_iface *hi = msg->context; + + spin_lock(&hi->lock); + hi->control_state &= ~SSI_CHANNEL_STATE_READING; + if (msg->status == HSI_STATUS_ERROR) { + dev_err(&hi->cl->device, "Control RX error detected\n"); + cs_hsi_control_read_error(hi, msg); + spin_unlock(&hi->lock); + goto out; + } + dev_dbg(&hi->cl->device, "Read on control: %08X\n", cmd); + cs_release_cmd(msg); + if (hi->flags & CS_FEAT_TSTAMP_RX_CTRL) { + struct timespec *tstamp = + &hi->mmap_cfg->tstamp_rx_ctrl; + do_posix_clock_monotonic_gettime(tstamp); + } + spin_unlock(&hi->lock); + + cs_notify_control(cmd); + +out: + cs_hsi_read_on_control(hi); +} + +static void cs_hsi_peek_on_control_complete(struct hsi_msg *msg) +{ + struct cs_hsi_iface *hi = msg->context; + int ret; + + if (msg->status == HSI_STATUS_ERROR) { + dev_err(&hi->cl->device, "Control peek RX error detected\n"); + cs_hsi_control_read_error(hi, msg); + return; + } + + WARN_ON(!(hi->control_state & SSI_CHANNEL_STATE_READING)); + + dev_dbg(&hi->cl->device, "Peek on control complete, reading\n"); + msg->sgt.nents = 1; + msg->complete = cs_hsi_read_on_control_complete; + ret = hsi_async_read(hi->cl, msg); + if (ret) + cs_hsi_control_read_error(hi, msg); +} + +static void cs_hsi_read_on_control(struct cs_hsi_iface *hi) +{ + struct hsi_msg *msg; + int ret; + + spin_lock(&hi->lock); + if (hi->control_state & SSI_CHANNEL_STATE_READING) { + dev_err(&hi->cl->device, "Control read already pending (%d)\n", + hi->control_state); + spin_unlock(&hi->lock); + return; + } + if (hi->control_state & SSI_CHANNEL_STATE_ERROR) { + dev_err(&hi->cl->device, "Control read error (%d)\n", + hi->control_state); + spin_unlock(&hi->lock); + return; + } + hi->control_state |= SSI_CHANNEL_STATE_READING; + dev_dbg(&hi->cl->device, "Issuing RX on control\n"); + msg = cs_claim_cmd(hi); + spin_unlock(&hi->lock); + + msg->sgt.nents = 0; + msg->complete = cs_hsi_peek_on_control_complete; + ret = hsi_async_read(hi->cl, msg); + if (ret) + cs_hsi_control_read_error(hi, msg); +} + +static void cs_hsi_write_on_control_complete(struct hsi_msg *msg) +{ + struct cs_hsi_iface *hi = msg->context; + if (msg->status == HSI_STATUS_COMPLETED) { + spin_lock(&hi->lock); + hi->control_state &= ~SSI_CHANNEL_STATE_WRITING; + cs_release_cmd(msg); + spin_unlock(&hi->lock); + } else if (msg->status == HSI_STATUS_ERROR) { + cs_hsi_control_write_error(hi, msg); + } else { + dev_err(&hi->cl->device, + "unexpected status in control write callback %d\n", + msg->status); + } +} + +static int cs_hsi_write_on_control(struct cs_hsi_iface *hi, u32 message) +{ + struct hsi_msg *msg; + int ret; + + spin_lock(&hi->lock); + if (hi->control_state & SSI_CHANNEL_STATE_ERROR) { + spin_unlock(&hi->lock); + return -EIO; + } + if (hi->control_state & SSI_CHANNEL_STATE_WRITING) { + dev_err(&hi->cl->device, + "Write still pending on control channel.\n"); + spin_unlock(&hi->lock); + return -EBUSY; + } + hi->control_state |= SSI_CHANNEL_STATE_WRITING; + msg = cs_claim_cmd(hi); + spin_unlock(&hi->lock); + + cs_set_cmd(msg, message); + msg->sgt.nents = 1; + msg->complete = cs_hsi_write_on_control_complete; + dev_dbg(&hi->cl->device, + "Sending control message %08X\n", message); + ret = hsi_async_write(hi->cl, msg); + if (ret) { + dev_err(&hi->cl->device, + "async_write failed with %d\n", ret); + cs_hsi_control_write_error(hi, msg); + } + + /* + * Make sure control read is always pending when issuing + * new control writes. This is needed as the controller + * may flush our messages if e.g. the peer device reboots + * unexpectedly (and we cannot directly resubmit a new read from + * the message destructor; see cs_cmd_destructor()). + */ + if (!(hi->control_state & SSI_CHANNEL_STATE_READING)) { + dev_err(&hi->cl->device, "Restarting control reads\n"); + cs_hsi_read_on_control(hi); + } + + return 0; +} + +static void cs_hsi_read_on_data_complete(struct hsi_msg *msg) +{ + struct cs_hsi_iface *hi = msg->context; + u32 payload; + + if (unlikely(msg->status == HSI_STATUS_ERROR)) { + cs_hsi_data_read_error(hi, msg); + return; + } + + spin_lock(&hi->lock); + WARN_ON(!(hi->data_state & SSI_CHANNEL_STATE_READING)); + hi->data_state &= ~SSI_CHANNEL_STATE_READING; + payload = CS_RX_DATA_RECEIVED; + payload |= hi->rx_slot; + hi->rx_slot++; + hi->rx_slot %= hi->rx_ptr_boundary; + /* expose current rx ptr in mmap area */ + hi->mmap_cfg->rx_ptr = hi->rx_slot; + if (unlikely(waitqueue_active(&hi->datawait))) + wake_up_interruptible(&hi->datawait); + spin_unlock(&hi->lock); + + cs_notify_data(payload, hi->rx_bufs); + cs_hsi_read_on_data(hi); +} + +static void cs_hsi_peek_on_data_complete(struct hsi_msg *msg) +{ + struct cs_hsi_iface *hi = msg->context; + u32 *address; + int ret; + + if (unlikely(msg->status == HSI_STATUS_ERROR)) { + cs_hsi_data_read_error(hi, msg); + return; + } + if (unlikely(hi->iface_state != CS_STATE_CONFIGURED)) { + dev_err(&hi->cl->device, "Data received in invalid state\n"); + cs_hsi_data_read_error(hi, msg); + return; + } + + spin_lock(&hi->lock); + WARN_ON(!(hi->data_state & SSI_CHANNEL_STATE_POLL)); + hi->data_state &= ~SSI_CHANNEL_STATE_POLL; + hi->data_state |= SSI_CHANNEL_STATE_READING; + spin_unlock(&hi->lock); + + address = (u32 *)(hi->mmap_base + + hi->rx_offsets[hi->rx_slot % hi->rx_bufs]); + sg_init_one(msg->sgt.sgl, address, hi->buf_size); + msg->sgt.nents = 1; + msg->complete = cs_hsi_read_on_data_complete; + ret = hsi_async_read(hi->cl, msg); + if (ret) + cs_hsi_data_read_error(hi, msg); +} + +/* + * Read/write transaction is ongoing. Returns false if in + * SSI_CHANNEL_STATE_POLL state. + */ +static inline int cs_state_xfer_active(unsigned int state) +{ + return (state & SSI_CHANNEL_STATE_WRITING) || + (state & SSI_CHANNEL_STATE_READING); +} + +/* + * No pending read/writes + */ +static inline int cs_state_idle(unsigned int state) +{ + return !(state & ~SSI_CHANNEL_STATE_ERROR); +} + +static void cs_hsi_read_on_data(struct cs_hsi_iface *hi) +{ + struct hsi_msg *rxmsg; + int ret; + + spin_lock(&hi->lock); + if (hi->data_state & + (SSI_CHANNEL_STATE_READING | SSI_CHANNEL_STATE_POLL)) { + dev_dbg(&hi->cl->device, "Data read already pending (%u)\n", + hi->data_state); + spin_unlock(&hi->lock); + return; + } + hi->data_state |= SSI_CHANNEL_STATE_POLL; + spin_unlock(&hi->lock); + + rxmsg = hi->data_rx_msg; + sg_init_one(rxmsg->sgt.sgl, (void *)hi->mmap_base, 0); + rxmsg->sgt.nents = 0; + rxmsg->complete = cs_hsi_peek_on_data_complete; + + ret = hsi_async_read(hi->cl, rxmsg); + if (ret) + cs_hsi_data_read_error(hi, rxmsg); +} + +static void cs_hsi_write_on_data_complete(struct hsi_msg *msg) +{ + struct cs_hsi_iface *hi = msg->context; + + if (msg->status == HSI_STATUS_COMPLETED) { + spin_lock(&hi->lock); + hi->data_state &= ~SSI_CHANNEL_STATE_WRITING; + if (unlikely(waitqueue_active(&hi->datawait))) + wake_up_interruptible(&hi->datawait); + spin_unlock(&hi->lock); + } else { + cs_hsi_data_write_error(hi, msg); + } +} + +static int cs_hsi_write_on_data(struct cs_hsi_iface *hi, unsigned int slot) +{ + u32 *address; + struct hsi_msg *txmsg; + int ret; + + spin_lock(&hi->lock); + if (hi->iface_state != CS_STATE_CONFIGURED) { + dev_err(&hi->cl->device, "Not configured, aborting\n"); + ret = -EINVAL; + goto error; + } + if (hi->data_state & SSI_CHANNEL_STATE_ERROR) { + dev_err(&hi->cl->device, "HSI error, aborting\n"); + ret = -EIO; + goto error; + } + if (hi->data_state & SSI_CHANNEL_STATE_WRITING) { + dev_err(&hi->cl->device, "Write pending on data channel.\n"); + ret = -EBUSY; + goto error; + } + hi->data_state |= SSI_CHANNEL_STATE_WRITING; + spin_unlock(&hi->lock); + + hi->tx_slot = slot; + address = (u32 *)(hi->mmap_base + hi->tx_offsets[hi->tx_slot]); + txmsg = hi->data_tx_msg; + sg_init_one(txmsg->sgt.sgl, address, hi->buf_size); + txmsg->complete = cs_hsi_write_on_data_complete; + ret = hsi_async_write(hi->cl, txmsg); + if (ret) + cs_hsi_data_write_error(hi, txmsg); + + return ret; + +error: + spin_unlock(&hi->lock); + if (ret == -EIO) + cs_hsi_data_write_error(hi, hi->data_tx_msg); + + return ret; +} + +static unsigned int cs_hsi_get_state(struct cs_hsi_iface *hi) +{ + return hi->iface_state; +} + +static int cs_hsi_command(struct cs_hsi_iface *hi, u32 cmd) +{ + int ret = 0; + + local_bh_disable(); + switch (cmd & TARGET_MASK) { + case TARGET_REMOTE: + ret = cs_hsi_write_on_control(hi, cmd); + break; + case TARGET_LOCAL: + if ((cmd & CS_CMD_MASK) == CS_TX_DATA_READY) + ret = cs_hsi_write_on_data(hi, cmd & CS_PARAM_MASK); + else + ret = -EINVAL; + break; + default: + ret = -EINVAL; + break; + } + local_bh_enable(); + + return ret; +} + +static void cs_hsi_set_wakeline(struct cs_hsi_iface *hi, bool new_state) +{ + int change = 0; + + spin_lock_bh(&hi->lock); + if (hi->wakeline_state != new_state) { + hi->wakeline_state = new_state; + change = 1; + dev_dbg(&hi->cl->device, "setting wake line to %d (%p)\n", + new_state, hi->cl); + } + spin_unlock_bh(&hi->lock); + + if (change) { + if (new_state) + ssip_slave_start_tx(hi->master); + else + ssip_slave_stop_tx(hi->master); + } + + dev_dbg(&hi->cl->device, "wake line set to %d (%p)\n", + new_state, hi->cl); +} + +static void set_buffer_sizes(struct cs_hsi_iface *hi, int rx_bufs, int tx_bufs) +{ + hi->rx_bufs = rx_bufs; + hi->tx_bufs = tx_bufs; + hi->mmap_cfg->rx_bufs = rx_bufs; + hi->mmap_cfg->tx_bufs = tx_bufs; + + if (hi->flags & CS_FEAT_ROLLING_RX_COUNTER) { + /* + * For more robust overrun detection, let the rx + * pointer run in range 0..'boundary-1'. Boundary + * is a multiple of rx_bufs, and limited in max size + * by RX_PTR_MAX_SHIFT to allow for fast ptr-diff + * calculation. + */ + hi->rx_ptr_boundary = (rx_bufs << RX_PTR_BOUNDARY_SHIFT); + hi->mmap_cfg->rx_ptr_boundary = hi->rx_ptr_boundary; + } else { + hi->rx_ptr_boundary = hi->rx_bufs; + } +} + +static int check_buf_params(struct cs_hsi_iface *hi, + const struct cs_buffer_config *buf_cfg) +{ + size_t buf_size_aligned = L1_CACHE_ALIGN(buf_cfg->buf_size) * + (buf_cfg->rx_bufs + buf_cfg->tx_bufs); + size_t ctrl_size_aligned = L1_CACHE_ALIGN(sizeof(*hi->mmap_cfg)); + int r = 0; + + if (buf_cfg->rx_bufs > CS_MAX_BUFFERS || + buf_cfg->tx_bufs > CS_MAX_BUFFERS) { + r = -EINVAL; + } else if ((buf_size_aligned + ctrl_size_aligned) >= hi->mmap_size) { + dev_err(&hi->cl->device, "No space for the requested buffer " + "configuration\n"); + r = -ENOBUFS; + } + + return r; +} + +/** + * Block until pending data transfers have completed. + */ +static int cs_hsi_data_sync(struct cs_hsi_iface *hi) +{ + int r = 0; + + spin_lock_bh(&hi->lock); + + if (!cs_state_xfer_active(hi->data_state)) { + dev_dbg(&hi->cl->device, "hsi_data_sync break, idle\n"); + goto out; + } + + for (;;) { + int s; + DEFINE_WAIT(wait); + if (!cs_state_xfer_active(hi->data_state)) + goto out; + if (signal_pending(current)) { + r = -ERESTARTSYS; + goto out; + } + /** + * prepare_to_wait must be called with hi->lock held + * so that callbacks can check for waitqueue_active() + */ + prepare_to_wait(&hi->datawait, &wait, TASK_INTERRUPTIBLE); + spin_unlock_bh(&hi->lock); + s = schedule_timeout( + msecs_to_jiffies(CS_HSI_TRANSFER_TIMEOUT_MS)); + spin_lock_bh(&hi->lock); + finish_wait(&hi->datawait, &wait); + if (!s) { + dev_dbg(&hi->cl->device, + "hsi_data_sync timeout after %d ms\n", + CS_HSI_TRANSFER_TIMEOUT_MS); + r = -EIO; + goto out; + } + } + +out: + spin_unlock_bh(&hi->lock); + dev_dbg(&hi->cl->device, "hsi_data_sync done with res %d\n", r); + + return r; +} + +static void cs_hsi_data_enable(struct cs_hsi_iface *hi, + struct cs_buffer_config *buf_cfg) +{ + unsigned int data_start, i; + + BUG_ON(hi->buf_size == 0); + + set_buffer_sizes(hi, buf_cfg->rx_bufs, buf_cfg->tx_bufs); + + hi->slot_size = L1_CACHE_ALIGN(hi->buf_size); + dev_dbg(&hi->cl->device, + "setting slot size to %u, buf size %u, align %u\n", + hi->slot_size, hi->buf_size, L1_CACHE_BYTES); + + data_start = L1_CACHE_ALIGN(sizeof(*hi->mmap_cfg)); + dev_dbg(&hi->cl->device, + "setting data start at %u, cfg block %u, align %u\n", + data_start, sizeof(*hi->mmap_cfg), L1_CACHE_BYTES); + + for (i = 0; i < hi->mmap_cfg->rx_bufs; i++) { + hi->rx_offsets[i] = data_start + i * hi->slot_size; + hi->mmap_cfg->rx_offsets[i] = hi->rx_offsets[i]; + dev_dbg(&hi->cl->device, "DL buf #%u at %u\n", + i, hi->rx_offsets[i]); + } + for (i = 0; i < hi->mmap_cfg->tx_bufs; i++) { + hi->tx_offsets[i] = data_start + + (i + hi->mmap_cfg->rx_bufs) * hi->slot_size; + hi->mmap_cfg->tx_offsets[i] = hi->tx_offsets[i]; + dev_dbg(&hi->cl->device, "UL buf #%u at %u\n", + i, hi->rx_offsets[i]); + } + + hi->iface_state = CS_STATE_CONFIGURED; +} + +static void cs_hsi_data_disable(struct cs_hsi_iface *hi, int old_state) +{ + if (old_state == CS_STATE_CONFIGURED) { + dev_dbg(&hi->cl->device, + "closing data channel with slot size 0\n"); + hi->iface_state = CS_STATE_OPENED; + } +} + +static int cs_hsi_buf_config(struct cs_hsi_iface *hi, + struct cs_buffer_config *buf_cfg) +{ + int r = 0; + unsigned int old_state = hi->iface_state; + + spin_lock_bh(&hi->lock); + /* Prevent new transactions during buffer reconfig */ + if (old_state == CS_STATE_CONFIGURED) + hi->iface_state = CS_STATE_OPENED; + spin_unlock_bh(&hi->lock); + + /* + * make sure that no non-zero data reads are ongoing before + * proceeding to change the buffer layout + */ + r = cs_hsi_data_sync(hi); + if (r < 0) + return r; + + WARN_ON(cs_state_xfer_active(hi->data_state)); + + spin_lock_bh(&hi->lock); + r = check_buf_params(hi, buf_cfg); + if (r < 0) + goto error; + + hi->buf_size = buf_cfg->buf_size; + hi->mmap_cfg->buf_size = hi->buf_size; + hi->flags = buf_cfg->flags; + + hi->rx_slot = 0; + hi->tx_slot = 0; + hi->slot_size = 0; + + if (hi->buf_size) + cs_hsi_data_enable(hi, buf_cfg); + else + cs_hsi_data_disable(hi, old_state); + + spin_unlock_bh(&hi->lock); + + if (old_state != hi->iface_state) { + if (hi->iface_state == CS_STATE_CONFIGURED) { + pm_qos_add_request(&hi->pm_qos_req, + PM_QOS_CPU_DMA_LATENCY, + CS_QOS_LATENCY_FOR_DATA_USEC); + local_bh_disable(); + cs_hsi_read_on_data(hi); + local_bh_enable(); + } else if (old_state == CS_STATE_CONFIGURED) { + pm_qos_remove_request(&hi->pm_qos_req); + } + } + return r; + +error: + spin_unlock_bh(&hi->lock); + return r; +} + +static int cs_hsi_start(struct cs_hsi_iface **hi, struct hsi_client *cl, + unsigned long mmap_base, unsigned long mmap_size) +{ + int err = 0; + struct cs_hsi_iface *hsi_if = kzalloc(sizeof(*hsi_if), GFP_KERNEL); + + dev_dbg(&cl->device, "cs_hsi_start\n"); + + if (!hsi_if) { + err = -ENOMEM; + goto leave0; + } + spin_lock_init(&hsi_if->lock); + hsi_if->cl = cl; + hsi_if->iface_state = CS_STATE_CLOSED; + hsi_if->mmap_cfg = (struct cs_mmap_config_block *)mmap_base; + hsi_if->mmap_base = mmap_base; + hsi_if->mmap_size = mmap_size; + memset(hsi_if->mmap_cfg, 0, sizeof(*hsi_if->mmap_cfg)); + init_waitqueue_head(&hsi_if->datawait); + err = cs_alloc_cmds(hsi_if); + if (err < 0) { + dev_err(&cl->device, "Unable to alloc HSI messages\n"); + goto leave1; + } + err = cs_hsi_alloc_data(hsi_if); + if (err < 0) { + dev_err(&cl->device, "Unable to alloc HSI messages for data\n"); + goto leave2; + } + err = hsi_claim_port(cl, 1); + if (err < 0) { + dev_err(&cl->device, + "Could not open, HSI port already claimed\n"); + goto leave3; + } + hsi_if->master = ssip_slave_get_master(cl); + if (IS_ERR(hsi_if->master)) { + dev_err(&cl->device, "Could not get HSI master client\n"); + goto leave4; + } + if (!ssip_slave_running(hsi_if->master)) { + err = -ENODEV; + dev_err(&cl->device, + "HSI port not initialized\n"); + goto leave4; + } + + hsi_if->iface_state = CS_STATE_OPENED; + local_bh_disable(); + cs_hsi_read_on_control(hsi_if); + local_bh_enable(); + + dev_dbg(&cl->device, "cs_hsi_start...done\n"); + + BUG_ON(!hi); + *hi = hsi_if; + + return 0; + +leave4: + hsi_release_port(cl); +leave3: + cs_hsi_free_data(hsi_if); +leave2: + cs_free_cmds(hsi_if); +leave1: + kfree(hsi_if); +leave0: + dev_dbg(&cl->device, "cs_hsi_start...done/error\n\n"); + + return err; +} + +static void cs_hsi_stop(struct cs_hsi_iface *hi) +{ + dev_dbg(&hi->cl->device, "cs_hsi_stop\n"); + cs_hsi_set_wakeline(hi, 0); + ssip_slave_put_master(hi->master); + + /* hsi_release_port() needs to be called with CS_STATE_CLOSED */ + hi->iface_state = CS_STATE_CLOSED; + hsi_release_port(hi->cl); + + /* + * hsi_release_port() should flush out all the pending + * messages, so cs_state_idle() should be true for both + * control and data channels. + */ + WARN_ON(!cs_state_idle(hi->control_state)); + WARN_ON(!cs_state_idle(hi->data_state)); + + if (pm_qos_request_active(&hi->pm_qos_req)) + pm_qos_remove_request(&hi->pm_qos_req); + + spin_lock_bh(&hi->lock); + cs_hsi_free_data(hi); + cs_free_cmds(hi); + spin_unlock_bh(&hi->lock); + kfree(hi); +} + +static int cs_char_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct cs_char *csdata = vma->vm_private_data; + struct page *page; + + page = virt_to_page(csdata->mmap_base); + get_page(page); + vmf->page = page; + + return 0; +} + +static struct vm_operations_struct cs_char_vm_ops = { + .fault = cs_char_vma_fault, +}; + +static int cs_char_fasync(int fd, struct file *file, int on) +{ + struct cs_char *csdata = file->private_data; + + if (fasync_helper(fd, file, on, &csdata->async_queue) < 0) + return -EIO; + + return 0; +} + +static unsigned int cs_char_poll(struct file *file, poll_table *wait) +{ + struct cs_char *csdata = file->private_data; + unsigned int ret = 0; + + poll_wait(file, &cs_char_data.wait, wait); + spin_lock_bh(&csdata->lock); + if (!list_empty(&csdata->chardev_queue)) + ret = POLLIN | POLLRDNORM; + else if (!list_empty(&csdata->dataind_queue)) + ret = POLLIN | POLLRDNORM; + spin_unlock_bh(&csdata->lock); + + return ret; +} + +static ssize_t cs_char_read(struct file *file, char __user *buf, size_t count, + loff_t *unused) +{ + struct cs_char *csdata = file->private_data; + u32 data; + ssize_t retval; + + if (count < sizeof(data)) + return -EINVAL; + + for (;;) { + DEFINE_WAIT(wait); + + spin_lock_bh(&csdata->lock); + if (!list_empty(&csdata->chardev_queue)) { + data = cs_pop_entry(&csdata->chardev_queue); + } else if (!list_empty(&csdata->dataind_queue)) { + data = cs_pop_entry(&csdata->dataind_queue); + csdata->dataind_pending--; + } else { + data = 0; + } + spin_unlock_bh(&csdata->lock); + + if (data) + break; + if (file->f_flags & O_NONBLOCK) { + retval = -EAGAIN; + goto out; + } else if (signal_pending(current)) { + retval = -ERESTARTSYS; + goto out; + } + prepare_to_wait_exclusive(&csdata->wait, &wait, + TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&csdata->wait, &wait); + } + + retval = put_user(data, (u32 __user *)buf); + if (!retval) + retval = sizeof(data); + +out: + return retval; +} + +static ssize_t cs_char_write(struct file *file, const char __user *buf, + size_t count, loff_t *unused) +{ + struct cs_char *csdata = file->private_data; + u32 data; + int err; + ssize_t retval; + + if (count < sizeof(data)) + return -EINVAL; + + if (get_user(data, (u32 __user *)buf)) + retval = -EFAULT; + else + retval = count; + + err = cs_hsi_command(csdata->hi, data); + if (err < 0) + retval = err; + + return retval; +} + +static long cs_char_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct cs_char *csdata = file->private_data; + int r = 0; + + switch (cmd) { + case CS_GET_STATE: { + unsigned int state; + + state = cs_hsi_get_state(csdata->hi); + if (copy_to_user((void __user *)arg, &state, sizeof(state))) + r = -EFAULT; + + break; + } + case CS_SET_WAKELINE: { + unsigned int state; + + if (copy_from_user(&state, (void __user *)arg, sizeof(state))) { + r = -EFAULT; + break; + } + + if (state > 1) { + r = -EINVAL; + break; + } + + cs_hsi_set_wakeline(csdata->hi, !!state); + + break; + } + case CS_GET_IF_VERSION: { + unsigned int ifver = CS_IF_VERSION; + + if (copy_to_user((void __user *)arg, &ifver, sizeof(ifver))) + r = -EFAULT; + + break; + } + case CS_CONFIG_BUFS: { + struct cs_buffer_config buf_cfg; + + if (copy_from_user(&buf_cfg, (void __user *)arg, + sizeof(buf_cfg))) + r = -EFAULT; + else + r = cs_hsi_buf_config(csdata->hi, &buf_cfg); + + break; + } + default: + r = -ENOTTY; + break; + } + + return r; +} + +static int cs_char_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (vma->vm_end < vma->vm_start) + return -EINVAL; + + if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) != 1) + return -EINVAL; + + vma->vm_flags |= VM_IO | VM_DONTDUMP | VM_DONTEXPAND; + vma->vm_ops = &cs_char_vm_ops; + vma->vm_private_data = file->private_data; + + return 0; +} + +static int cs_char_open(struct inode *unused, struct file *file) +{ + int ret = 0; + unsigned long p; + + spin_lock_bh(&cs_char_data.lock); + if (cs_char_data.opened) { + ret = -EBUSY; + spin_unlock_bh(&cs_char_data.lock); + goto out1; + } + cs_char_data.opened = 1; + cs_char_data.dataind_pending = 0; + spin_unlock_bh(&cs_char_data.lock); + + p = get_zeroed_page(GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + goto out2; + } + + ret = cs_hsi_start(&cs_char_data.hi, cs_char_data.cl, p, CS_MMAP_SIZE); + if (ret) { + dev_err(&cs_char_data.cl->device, "Unable to initialize HSI\n"); + goto out3; + } + + /* these are only used in release so lock not needed */ + cs_char_data.mmap_base = p; + cs_char_data.mmap_size = CS_MMAP_SIZE; + + file->private_data = &cs_char_data; + + return 0; + +out3: + free_page(p); +out2: + spin_lock_bh(&cs_char_data.lock); + cs_char_data.opened = 0; + spin_unlock_bh(&cs_char_data.lock); +out1: + return ret; +} + +static void cs_free_char_queue(struct list_head *head) +{ + struct char_queue *entry; + struct list_head *cursor, *next; + + if (!list_empty(head)) { + list_for_each_safe(cursor, next, head) { + entry = list_entry(cursor, struct char_queue, list); + list_del(&entry->list); + kfree(entry); + } + } + +} + +static int cs_char_release(struct inode *unused, struct file *file) +{ + struct cs_char *csdata = file->private_data; + + cs_hsi_stop(csdata->hi); + spin_lock_bh(&csdata->lock); + csdata->hi = NULL; + free_page(csdata->mmap_base); + cs_free_char_queue(&csdata->chardev_queue); + cs_free_char_queue(&csdata->dataind_queue); + csdata->opened = 0; + spin_unlock_bh(&csdata->lock); + + return 0; +} + +static const struct file_operations cs_char_fops = { + .owner = THIS_MODULE, + .read = cs_char_read, + .write = cs_char_write, + .poll = cs_char_poll, + .unlocked_ioctl = cs_char_ioctl, + .mmap = cs_char_mmap, + .open = cs_char_open, + .release = cs_char_release, + .fasync = cs_char_fasync, +}; + +static struct miscdevice cs_char_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cmt_speech", + .fops = &cs_char_fops +}; + +static int cs_hsi_client_probe(struct device *dev) +{ + int err = 0; + struct hsi_client *cl = to_hsi_client(dev); + + dev_dbg(dev, "hsi_client_probe\n"); + init_waitqueue_head(&cs_char_data.wait); + spin_lock_init(&cs_char_data.lock); + cs_char_data.opened = 0; + cs_char_data.cl = cl; + cs_char_data.hi = NULL; + INIT_LIST_HEAD(&cs_char_data.chardev_queue); + INIT_LIST_HEAD(&cs_char_data.dataind_queue); + + cs_char_data.channel_id_cmd = hsi_get_channel_id_by_name(cl, + "speech-control"); + if (cs_char_data.channel_id_cmd < 0) { + err = cs_char_data.channel_id_cmd; + dev_err(dev, "Could not get cmd channel (%d)\n", err); + return err; + } + + cs_char_data.channel_id_data = hsi_get_channel_id_by_name(cl, + "speech-data"); + if (cs_char_data.channel_id_data < 0) { + err = cs_char_data.channel_id_data; + dev_err(dev, "Could not get data channel (%d)\n", err); + return err; + } + + err = misc_register(&cs_char_miscdev); + if (err) + dev_err(dev, "Failed to register: %d\n", err); + + return err; +} + +static int cs_hsi_client_remove(struct device *dev) +{ + struct cs_hsi_iface *hi; + + dev_dbg(dev, "hsi_client_remove\n"); + misc_deregister(&cs_char_miscdev); + spin_lock_bh(&cs_char_data.lock); + hi = cs_char_data.hi; + cs_char_data.hi = NULL; + spin_unlock_bh(&cs_char_data.lock); + if (hi) + cs_hsi_stop(hi); + + return 0; +} + +static struct hsi_client_driver cs_hsi_driver = { + .driver = { + .name = "cmt-speech", + .owner = THIS_MODULE, + .probe = cs_hsi_client_probe, + .remove = cs_hsi_client_remove, + }, +}; + +static int __init cs_char_init(void) +{ + pr_info("CMT speech driver added\n"); + return hsi_register_client_driver(&cs_hsi_driver); +} +module_init(cs_char_init); + +static void __exit cs_char_exit(void) +{ + hsi_unregister_client_driver(&cs_hsi_driver); + pr_info("CMT speech driver removed\n"); +} +module_exit(cs_char_exit); + +MODULE_ALIAS("hsi:cmt-speech"); +MODULE_AUTHOR("Kai Vehmanen "); +MODULE_AUTHOR("Peter Ujfalusi "); +MODULE_DESCRIPTION("CMT speech driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/uapi/linux/hsi/Kbuild b/include/uapi/linux/hsi/Kbuild index 30ab3cd3b8a5..a16a00544258 100644 --- a/include/uapi/linux/hsi/Kbuild +++ b/include/uapi/linux/hsi/Kbuild @@ -1,2 +1,2 @@ # UAPI Header export list -header-y += hsi_char.h +header-y += hsi_char.h cs-protocol.h diff --git a/include/uapi/linux/hsi/cs-protocol.h b/include/uapi/linux/hsi/cs-protocol.h new file mode 100644 index 000000000000..4957bba57cbe --- /dev/null +++ b/include/uapi/linux/hsi/cs-protocol.h @@ -0,0 +1,113 @@ +/* + * cmt-speech interface definitions + * + * Copyright (C) 2008,2009,2010 Nokia Corporation. All rights reserved. + * + * Contact: Kai Vehmanen + * Original author: Peter Ujfalusi + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#ifndef _CS_PROTOCOL_H +#define _CS_PROTOCOL_H + +#include +#include + +/* chardev parameters */ +#define CS_DEV_FILE_NAME "/dev/cmt_speech" + +/* user-space API versioning */ +#define CS_IF_VERSION 2 + +/* APE kernel <-> user space messages */ +#define CS_CMD_SHIFT 28 +#define CS_DOMAIN_SHIFT 24 + +#define CS_CMD_MASK 0xff000000 +#define CS_PARAM_MASK 0xffffff + +#define CS_CMD(id, dom) \ + (((id) << CS_CMD_SHIFT) | ((dom) << CS_DOMAIN_SHIFT)) + +#define CS_ERROR CS_CMD(1, 0) +#define CS_RX_DATA_RECEIVED CS_CMD(2, 0) +#define CS_TX_DATA_READY CS_CMD(3, 0) +#define CS_TX_DATA_SENT CS_CMD(4, 0) + +/* params to CS_ERROR indication */ +#define CS_ERR_PEER_RESET 0 + +/* ioctl interface */ + +/* parameters to CS_CONFIG_BUFS ioctl */ +#define CS_FEAT_TSTAMP_RX_CTRL (1 << 0) +#define CS_FEAT_ROLLING_RX_COUNTER (2 << 0) + +/* parameters to CS_GET_STATE ioctl */ +#define CS_STATE_CLOSED 0 +#define CS_STATE_OPENED 1 /* resource allocated */ +#define CS_STATE_CONFIGURED 2 /* data path active */ + +/* maximum number of TX/RX buffers */ +#define CS_MAX_BUFFERS_SHIFT 4 +#define CS_MAX_BUFFERS (1 << CS_MAX_BUFFERS_SHIFT) + +/* Parameters for setting up the data buffers */ +struct cs_buffer_config { + __u32 rx_bufs; /* number of RX buffer slots */ + __u32 tx_bufs; /* number of TX buffer slots */ + __u32 buf_size; /* bytes */ + __u32 flags; /* see CS_FEAT_* */ + __u32 reserved[4]; +}; + +/* + * Struct describing the layout and contents of the driver mmap area. + * This information is meant as read-only information for the application. + */ +struct cs_mmap_config_block { + __u32 reserved1; + __u32 buf_size; /* 0=disabled, otherwise the transfer size */ + __u32 rx_bufs; /* # of RX buffers */ + __u32 tx_bufs; /* # of TX buffers */ + __u32 reserved2; + /* array of offsets within the mmap area for each RX and TX buffer */ + __u32 rx_offsets[CS_MAX_BUFFERS]; + __u32 tx_offsets[CS_MAX_BUFFERS]; + __u32 rx_ptr; + __u32 rx_ptr_boundary; + __u32 reserved3[2]; + /* + * if enabled with CS_FEAT_TSTAMP_RX_CTRL, monotonic + * timestamp taken when the last control command was received + */ + struct timespec tstamp_rx_ctrl; +}; + +#define CS_IO_MAGIC 'C' + +#define CS_IOW(num, dtype) _IOW(CS_IO_MAGIC, num, dtype) +#define CS_IOR(num, dtype) _IOR(CS_IO_MAGIC, num, dtype) +#define CS_IOWR(num, dtype) _IOWR(CS_IO_MAGIC, num, dtype) +#define CS_IO(num) _IO(CS_IO_MAGIC, num) + +#define CS_GET_STATE CS_IOR(21, unsigned int) +#define CS_SET_WAKELINE CS_IOW(23, unsigned int) +#define CS_GET_IF_VERSION CS_IOR(30, unsigned int) +#define CS_CONFIG_BUFS CS_IOW(31, struct cs_buffer_config) + +#endif /* _CS_PROTOCOL_H */ -- cgit v1.2.3 From 761da2935d6e18d178582dbdf315a3a458555505 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 26 Mar 2015 12:39:36 +0000 Subject: netfilter: nf_tables: add set timeout API support Add set timeout support to the netlink API. Sets with timeout support enabled can have a default timeout value and garbage collection interval specified. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 9 +++++++++ include/uapi/linux/netfilter/nf_tables.h | 6 ++++++ net/netfilter/nf_tables_api.c | 30 ++++++++++++++++++++++++++++-- 3 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index b8cd60dcb4e1..8936803a2ad5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -258,6 +258,8 @@ void nft_unregister_set(struct nft_set_ops *ops); * @dtype: data type (verdict or numeric type defined by userspace) * @size: maximum set size * @nelems: number of elements + * @timeout: default timeout value in msecs + * @gc_int: garbage collection interval in msecs * @policy: set parameterization (see enum nft_set_policies) * @ops: set ops * @pnet: network namespace @@ -274,6 +276,8 @@ struct nft_set { u32 dtype; u32 size; u32 nelems; + u64 timeout; + u32 gc_int; u16 policy; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; @@ -295,6 +299,11 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table, struct nft_set *nf_tables_set_lookup_byid(const struct net *net, const struct nlattr *nla); +static inline unsigned long nft_set_gc_interval(const struct nft_set *set) +{ + return set->gc_int ? msecs_to_jiffies(set->gc_int) : HZ; +} + /** * struct nft_set_binding - nf_tables set binding * diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b9783931503b..971d245e7378 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -208,12 +208,14 @@ enum nft_rule_compat_attributes { * @NFT_SET_CONSTANT: set contents may not change while bound * @NFT_SET_INTERVAL: set contains intervals * @NFT_SET_MAP: set is used as a dictionary + * @NFT_SET_TIMEOUT: set uses timeouts */ enum nft_set_flags { NFT_SET_ANONYMOUS = 0x1, NFT_SET_CONSTANT = 0x2, NFT_SET_INTERVAL = 0x4, NFT_SET_MAP = 0x8, + NFT_SET_TIMEOUT = 0x10, }; /** @@ -252,6 +254,8 @@ enum nft_set_desc_attributes { * @NFTA_SET_POLICY: selection policy (NLA_U32) * @NFTA_SET_DESC: set description (NLA_NESTED) * @NFTA_SET_ID: uniquely identifies a set in a transaction (NLA_U32) + * @NFTA_SET_TIMEOUT: default timeout value (NLA_U64) + * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -265,6 +269,8 @@ enum nft_set_attributes { NFTA_SET_POLICY, NFTA_SET_DESC, NFTA_SET_ID, + NFTA_SET_TIMEOUT, + NFTA_SET_GC_INTERVAL, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5604c2df05d1..6320b64e773e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2216,6 +2216,8 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_POLICY] = { .type = NLA_U32 }, [NFTA_SET_DESC] = { .type = NLA_NESTED }, [NFTA_SET_ID] = { .type = NLA_U32 }, + [NFTA_SET_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -2366,6 +2368,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + if (set->timeout && + nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout))) + goto nla_put_failure; + if (set->gc_int && + nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int))) + goto nla_put_failure; + if (set->policy != NFT_SET_POL_PERFORMANCE) { if (nla_put_be32(skb, NFTA_SET_POLICY, htonl(set->policy))) goto nla_put_failure; @@ -2578,7 +2587,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, char name[IFNAMSIZ]; unsigned int size; bool create; - u32 ktype, dtype, flags, policy; + u64 timeout; + u32 ktype, dtype, flags, policy, gc_int; struct nft_set_desc desc; int err; @@ -2605,7 +2615,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_FLAGS] != NULL) { flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | - NFT_SET_INTERVAL | NFT_SET_MAP)) + NFT_SET_INTERVAL | NFT_SET_MAP | + NFT_SET_TIMEOUT)) return -EINVAL; } @@ -2631,6 +2642,19 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, } else if (flags & NFT_SET_MAP) return -EINVAL; + timeout = 0; + if (nla[NFTA_SET_TIMEOUT] != NULL) { + if (!(flags & NFT_SET_TIMEOUT)) + return -EINVAL; + timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_TIMEOUT])); + } + gc_int = 0; + if (nla[NFTA_SET_GC_INTERVAL] != NULL) { + if (!(flags & NFT_SET_TIMEOUT)) + return -EINVAL; + gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); + } + policy = NFT_SET_POL_PERFORMANCE; if (nla[NFTA_SET_POLICY] != NULL) policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); @@ -2699,6 +2723,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, set->flags = flags; set->size = desc.size; set->policy = policy; + set->timeout = timeout; + set->gc_int = gc_int; err = ops->init(set, &desc, nla); if (err < 0) -- cgit v1.2.3 From c3e1b005ed1cc068fc9d454a6e745830d55d251d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 26 Mar 2015 12:39:37 +0000 Subject: netfilter: nf_tables: add set element timeout support Add API support for set element timeouts. Elements can have a individual timeout value specified, overriding the sets' default. Two new extension types are used for timeouts - the timeout value and the expiration time. The timeout value only exists if it differs from the default value. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 20 ++++++++++++ include/uapi/linux/netfilter/nf_tables.h | 4 +++ net/netfilter/nf_tables_api.c | 53 ++++++++++++++++++++++++++++++-- 3 files changed, 75 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 8936803a2ad5..f2726c537248 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -329,12 +329,16 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, * @NFT_SET_EXT_KEY: element key * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags + * @NFT_SET_EXT_TIMEOUT: element timeout + * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { NFT_SET_EXT_KEY, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, + NFT_SET_EXT_TIMEOUT, + NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_NUM }; @@ -431,6 +435,22 @@ static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext) return nft_set_ext(ext, NFT_SET_EXT_FLAGS); } +static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); +} + +static inline unsigned long *nft_set_ext_expiration(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); +} + +static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) +{ + return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && + time_is_before_eq_jiffies(*nft_set_ext_expiration(ext)); +} + static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, void *elem) { diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 971d245e7378..83441cc4594b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -290,12 +290,16 @@ enum nft_set_elem_flags { * @NFTA_SET_ELEM_KEY: key value (NLA_NESTED: nft_data) * @NFTA_SET_ELEM_DATA: data value of mapping (NLA_NESTED: nft_data_attributes) * @NFTA_SET_ELEM_FLAGS: bitmask of nft_set_elem_flags (NLA_U32) + * @NFTA_SET_ELEM_TIMEOUT: timeout value (NLA_U64) + * @NFTA_SET_ELEM_EXPIRATION: expiration time (NLA_U64) */ enum nft_set_elem_attributes { NFTA_SET_ELEM_UNSPEC, NFTA_SET_ELEM_KEY, NFTA_SET_ELEM_DATA, NFTA_SET_ELEM_FLAGS, + NFTA_SET_ELEM_TIMEOUT, + NFTA_SET_ELEM_EXPIRATION, __NFTA_SET_ELEM_MAX }; #define NFTA_SET_ELEM_MAX (__NFTA_SET_ELEM_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6320b64e773e..9e032dbc149c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2863,6 +2863,14 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .len = sizeof(u8), .align = __alignof__(u8), }, + [NFT_SET_EXT_TIMEOUT] = { + .len = sizeof(u64), + .align = __alignof__(u64), + }, + [NFT_SET_EXT_EXPIRATION] = { + .len = sizeof(unsigned long), + .align = __alignof__(unsigned long), + }, }; EXPORT_SYMBOL_GPL(nft_set_ext_types); @@ -2874,6 +2882,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_KEY] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, + [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -2935,6 +2944,25 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && + nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, + cpu_to_be64(*nft_set_ext_timeout(ext)))) + goto nla_put_failure; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { + unsigned long expires, now = jiffies; + + expires = *nft_set_ext_expiration(ext); + if (time_before(now, expires)) + expires -= now; + else + expires = 0; + + if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, + cpu_to_be64(jiffies_to_msecs(expires)))) + goto nla_put_failure; + } + nla_nest_end(skb, nest); return 0; @@ -3158,7 +3186,7 @@ static void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const struct nft_data *key, const struct nft_data *data, - gfp_t gfp) + u64 timeout, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -3173,6 +3201,11 @@ static void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_key(ext), key, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) memcpy(nft_set_ext_data(ext), data, set->dlen); + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) + *nft_set_ext_expiration(ext) = + jiffies + msecs_to_jiffies(timeout); + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) + *nft_set_ext_timeout(ext) = timeout; return elem; } @@ -3201,6 +3234,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; + u64 timeout; u32 flags; int err; @@ -3241,6 +3275,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return -EINVAL; } + timeout = 0; + if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EINVAL; + timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_ELEM_TIMEOUT])); + } else if (set->flags & NFT_SET_TIMEOUT) { + timeout = set->timeout; + } + err = nft_data_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err1; @@ -3249,6 +3292,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err2; nft_set_ext_add(&tmpl, NFT_SET_EXT_KEY); + if (timeout > 0) { + nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); + if (timeout != set->timeout) + nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); + } if (nla[NFTA_SET_ELEM_DATA] != NULL) { err = nft_data_init(ctx, &data, &d2, nla[NFTA_SET_ELEM_DATA]); @@ -3277,7 +3325,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, &elem.key, &data, GFP_KERNEL); + elem.priv = nft_set_elem_init(set, &tmpl, &elem.key, &data, + timeout, GFP_KERNEL); if (elem.priv == NULL) goto err3; -- cgit v1.2.3 From a5581ef4c2eac6449188862e903eb46c7233582a Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 1 Apr 2015 07:50:29 +0200 Subject: can: introduce new raw socket option to join the given CAN filters The CAN_RAW socket can set multiple CAN identifier specific filters that lead to multiple filters in the af_can.c filter processing. These filters are indenpendent from each other which leads to logical OR'ed filters when applied. This socket option joines the given CAN filters in the way that only CAN frames are passed to user space that matched *all* given CAN filters. The semantic for the applied filters is therefore changed to a logical AND. This is useful especially when the filterset is a combination of filters where the CAN_INV_FILTER flag is set in order to notch single CAN IDs or CAN ID ranges from the incoming traffic. As the raw_rcv() function is executed from NET_RX softirq the introduced variables are implemented as per-CPU variables to avoid extensive locking at CAN frame reception time. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- Documentation/networking/can.txt | 20 ++++++++++++++++++-- include/uapi/linux/can/raw.h | 1 + net/can/raw.c | 31 ++++++++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt index 0a2859a8ee7e..5abad1e921ca 100644 --- a/Documentation/networking/can.txt +++ b/Documentation/networking/can.txt @@ -22,7 +22,8 @@ This file contains 4.1.3 RAW socket option CAN_RAW_LOOPBACK 4.1.4 RAW socket option CAN_RAW_RECV_OWN_MSGS 4.1.5 RAW socket option CAN_RAW_FD_FRAMES - 4.1.6 RAW socket returned message flags + 4.1.6 RAW socket option CAN_RAW_JOIN_FILTERS + 4.1.7 RAW socket returned message flags 4.2 Broadcast Manager protocol sockets (SOCK_DGRAM) 4.2.1 Broadcast Manager operations 4.2.2 Broadcast Manager message flags @@ -601,7 +602,22 @@ solution for a couple of reasons: CAN FD frames by checking if the device maximum transfer unit is CANFD_MTU. The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall. - 4.1.6 RAW socket returned message flags + 4.1.6 RAW socket option CAN_RAW_JOIN_FILTERS + + The CAN_RAW socket can set multiple CAN identifier specific filters that + lead to multiple filters in the af_can.c filter processing. These filters + are indenpendent from each other which leads to logical OR'ed filters when + applied (see 4.1.1). + + This socket option joines the given CAN filters in the way that only CAN + frames are passed to user space that matched *all* given CAN filters. The + semantic for the applied filters is therefore changed to a logical AND. + + This is useful especially when the filterset is a combination of filters + where the CAN_INV_FILTER flag is set in order to notch single CAN IDs or + CAN ID ranges from the incoming traffic. + + 4.1.7 RAW socket returned message flags When using recvmsg() call, the msg->msg_flags may contain following flags: diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h index 78ec76fd89a6..8735f1080385 100644 --- a/include/uapi/linux/can/raw.h +++ b/include/uapi/linux/can/raw.h @@ -57,6 +57,7 @@ enum { CAN_RAW_LOOPBACK, /* local loopback (default:on) */ CAN_RAW_RECV_OWN_MSGS, /* receive my own msgs (default:off) */ CAN_RAW_FD_FRAMES, /* allow CAN FD frames (default:off) */ + CAN_RAW_JOIN_FILTERS, /* all filters must match to trigger */ }; #endif /* !_UAPI_CAN_RAW_H */ diff --git a/net/can/raw.c b/net/can/raw.c index 0c8d537b59b8..31b9748cbb4e 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -77,6 +77,7 @@ MODULE_ALIAS("can-proto-1"); struct uniqframe { ktime_t tstamp; const struct sk_buff *skb; + unsigned int join_rx_count; }; struct raw_sock { @@ -87,6 +88,7 @@ struct raw_sock { int loopback; int recv_own_msgs; int fd_frames; + int join_filters; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ struct can_filter *filter; /* pointer to filter(s) */ @@ -132,10 +134,21 @@ static void raw_rcv(struct sk_buff *oskb, void *data) /* eliminate multiple filter matches for the same skb */ if (this_cpu_ptr(ro->uniq)->skb == oskb && ktime_equal(this_cpu_ptr(ro->uniq)->tstamp, oskb->tstamp)) { - return; + if (ro->join_filters) { + this_cpu_inc(ro->uniq->join_rx_count); + /* drop frame until all enabled filters matched */ + if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count) + return; + } else { + return; + } } else { this_cpu_ptr(ro->uniq)->skb = oskb; this_cpu_ptr(ro->uniq)->tstamp = oskb->tstamp; + this_cpu_ptr(ro->uniq)->join_rx_count = 1; + /* drop first frame to check all enabled filters? */ + if (ro->join_filters && ro->count > 1) + return; } /* clone the given skb to be able to enqueue it into the rcv queue */ @@ -311,6 +324,7 @@ static int raw_init(struct sock *sk) ro->loopback = 1; ro->recv_own_msgs = 0; ro->fd_frames = 0; + ro->join_filters = 0; /* alloc_percpu provides zero'ed memory */ ro->uniq = alloc_percpu(struct uniqframe); @@ -604,6 +618,15 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; + case CAN_RAW_JOIN_FILTERS: + if (optlen != sizeof(ro->join_filters)) + return -EINVAL; + + if (copy_from_user(&ro->join_filters, optval, optlen)) + return -EFAULT; + + break; + default: return -ENOPROTOOPT; } @@ -668,6 +691,12 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, val = &ro->fd_frames; break; + case CAN_RAW_JOIN_FILTERS: + if (len > sizeof(int)) + len = sizeof(int); + val = &ro->join_filters; + break; + default: return -ENOPROTOOPT; } -- cgit v1.2.3 From 2541517c32be2531e0da59dfd7efc1ce844644f5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:20 -0700 Subject: tracing, perf: Implement BPF programs attached to kprobes BPF programs, attached to kprobes, provide a safe way to execute user-defined BPF byte-code programs without being able to crash or hang the kernel in any way. The BPF engine makes sure that such programs have a finite execution time and that they cannot break out of their sandbox. The user interface is to attach to a kprobe via the perf syscall: struct perf_event_attr attr = { .type = PERF_TYPE_TRACEPOINT, .config = event_id, ... }; event_fd = perf_event_open(&attr,...); ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); 'prog_fd' is a file descriptor associated with BPF program previously loaded. 'event_id' is an ID of the kprobe created. Closing 'event_fd': close(event_fd); ... automatically detaches BPF program from it. BPF programs can call in-kernel helper functions to: - lookup/update/delete elements in maps - probe_read - wraper of probe_kernel_read() used to access any kernel data structures BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is architecture dependent) and return 0 to ignore the event and 1 to store kprobe event into the ring buffer. Note, kprobes are a fundamentally _not_ a stable kernel ABI, so BPF programs attached to kprobes must be recompiled for every kernel version and user must supply correct LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Reviewed-by: Masami Hiramatsu Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 11 ++++ include/uapi/linux/bpf.h | 3 + include/uapi/linux/perf_event.h | 1 + kernel/bpf/syscall.c | 7 ++- kernel/events/core.c | 59 ++++++++++++++++++ kernel/trace/Makefile | 1 + kernel/trace/bpf_trace.c | 130 ++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 8 +++ 8 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/bpf_trace.c (limited to 'include/uapi') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 77325e1a1816..0aa535bc9f05 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -13,6 +13,7 @@ struct trace_array; struct trace_buffer; struct tracer; struct dentry; +struct bpf_prog; struct trace_print_flags { unsigned long mask; @@ -306,6 +307,7 @@ struct ftrace_event_call { #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; + struct bpf_prog *prog; int (*perf_perm)(struct ftrace_event_call *, struct perf_event *); @@ -551,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file, event_triggers_post_call(file, tt); } +#ifdef CONFIG_BPF_SYSCALL +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); +#else +static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + return 1; +} +#endif + enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 45da7ec7d274..b2948feeb70b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, }; /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -151,6 +152,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* checked when prog_type=kprobe */ }; } __attribute__((aligned(8))); @@ -162,6 +164,7 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ __BPF_FUNC_MAX_ID, }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3bb40ddadbe5..91803e54ee73 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -381,6 +381,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 536edc2be307..504c10b990ef 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -16,6 +16,7 @@ #include #include #include +#include static LIST_HEAD(bpf_map_types); @@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD log_buf +#define BPF_PROG_LOAD_LAST_FIELD kern_version static int bpf_prog_load(union bpf_attr *attr) { @@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr) if (attr->insn_cnt >= BPF_MAXINSNS) return -EINVAL; + if (type == BPF_PROG_TYPE_KPROBE && + attr->kern_version != LINUX_VERSION_CODE) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) diff --git a/kernel/events/core.c b/kernel/events/core.c index c40c2cac2d8e..5c13862d3e85 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -3407,6 +3409,7 @@ errout: } static void perf_event_free_filter(struct perf_event *event); +static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -3416,6 +3419,7 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); + perf_event_free_bpf_prog(event); kfree(event); } @@ -3928,6 +3932,7 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -3981,6 +3986,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); + case PERF_EVENT_IOC_SET_BPF: + return perf_event_set_bpf_prog(event, arg); + default: return -ENOTTY; } @@ -6455,6 +6463,49 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + if (event->tp_event->prog) + return -EEXIST; + + if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + /* bpf programs can only be attached to kprobes */ + return -EINVAL; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) { + /* valid fd, but invalid bpf program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + event->tp_event->prog = prog; + + return 0; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ + struct bpf_prog *prog; + + if (!event->tp_event) + return; + + prog = event->tp_event->prog; + if (prog) { + event->tp_event->prog = NULL; + bpf_prog_put(prog); + } +} + #else static inline void perf_tp_register(void) @@ -6470,6 +6521,14 @@ static void perf_event_free_filter(struct perf_event *event) { } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + return -ENOENT; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ +} #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98f26588255e..c575a300103b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 000000000000..f1e87da91da3 --- /dev/null +++ b/kernel/trace/bpf_trace.c @@ -0,0 +1,130 @@ +/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include "trace.h" + +static DEFINE_PER_CPU(int, bpf_prog_active); + +/** + * trace_call_bpf - invoke BPF program + * @prog: BPF program + * @ctx: opaque context pointer + * + * kprobe handlers execute BPF programs via this helper. + * Can be used from static tracepoints in the future. + * + * Return: BPF programs always return an integer which is interpreted by + * kprobe handler as: + * 0 - return from kprobe (event is filtered out) + * 1 - store kprobe event into ring buffer + * Other values are reserved and currently alias to 1 + */ +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + unsigned int ret; + + if (in_nmi()) /* not supported yet */ + return 1; + + preempt_disable(); + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + /* + * since some bpf program is already running on this cpu, + * don't call into another bpf program (same or different) + * and don't send kprobe event into ring-buffer, + * so return zero here + */ + ret = 0; + goto out; + } + + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, ctx); + rcu_read_unlock(); + + out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_call_bpf); + +static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *dst = (void *) (long) r1; + int size = (int) r2; + void *unsafe_ptr = (void *) (long) r3; + + return probe_kernel_read(dst, unsafe_ptr, size); +} + +static const struct bpf_func_proto bpf_probe_read_proto = { + .func = bpf_probe_read, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_probe_read: + return &bpf_probe_read_proto; + default: + return NULL; + } +} + +/* bpf+kprobe programs can access fields of 'struct pt_regs' */ +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* check bounds */ + if (off < 0 || off >= sizeof(struct pt_regs)) + return false; + + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + return true; +} + +static struct bpf_verifier_ops kprobe_prog_ops = { + .get_func_proto = kprobe_prog_func_proto, + .is_valid_access = kprobe_prog_is_valid_access, +}; + +static struct bpf_prog_type_list kprobe_tl = { + .ops = &kprobe_prog_ops, + .type = BPF_PROG_TYPE_KPROBE, +}; + +static int __init register_kprobe_prog_ops(void) +{ + bpf_register_prog_type(&kprobe_tl); + return 0; +} +late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8fa549f6f528..dc3462507d7c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1134,11 +1134,15 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; @@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; -- cgit v1.2.3 From d9847d310ab4003725e6ed1822682e24bd406908 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:21 -0700 Subject: tracing: Allow BPF programs to call bpf_ktime_get_ns() bpf_ktime_get_ns() is used by programs to compute time delta between events or as a timestamp Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-5-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b2948feeb70b..238c6883877b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_func_id { BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ + BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f1e87da91da3..8f5787294971 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -78,6 +78,18 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; +static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* NMI safe access to clock monotonic */ + return ktime_get_mono_fast_ns(); +} + +static const struct bpf_func_proto bpf_ktime_get_ns_proto = { + .func = bpf_ktime_get_ns, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -89,6 +101,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_map_delete_elem_proto; case BPF_FUNC_probe_read: return &bpf_probe_read_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; default: return NULL; } -- cgit v1.2.3 From 9c959c863f8217a2ff3d7c296e8223654d240569 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:22 -0700 Subject: tracing: Allow BPF programs to call bpf_trace_printk() Debugging of BPF programs needs some form of printk from the program, so let programs call limited trace_printk() with %d %u %x %p modifiers only. Similar to kernel modules, during program load verifier checks whether program is calling bpf_trace_printk() and if so, kernel allocates trace_printk buffers and emits big 'this is debug only' banner. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-6-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 238c6883877b..cc47ef41076a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -166,6 +166,7 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ + BPF_FUNC_trace_printk, /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8f5787294971..2d56ce501632 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "trace.h" static DEFINE_PER_CPU(int, bpf_prog_active); @@ -90,6 +91,74 @@ static const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; +/* + * limited trace_printk() + * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + */ +static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +{ + char *fmt = (char *) (long) r1; + int mod[3] = {}; + int fmt_cnt = 0; + int i; + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + return -EINVAL; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) + return -EINVAL; + + if (fmt[i] != '%') + continue; + + if (fmt_cnt >= 3) + return -EINVAL; + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } else if (fmt[i] == 'p') { + mod[fmt_cnt]++; + i++; + if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + return -EINVAL; + fmt_cnt++; + continue; + } + + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } + + if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + return -EINVAL; + fmt_cnt++; + } + + return __trace_printk(1/* fake ip will not be printed */, fmt, + mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, + mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, + mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); +} + +static const struct bpf_func_proto bpf_trace_printk_proto = { + .func = bpf_trace_printk, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -103,6 +172,15 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + + case BPF_FUNC_trace_printk: + /* + * this program might be calling bpf_trace_printk, + * so allocate per-cpu printk buffers + */ + trace_printk_init_buffers(); + + return &bpf_trace_printk_proto; default: return NULL; } -- cgit v1.2.3 From e8c6deac69629c0cb97c3d3272f8631ef17f8f0f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:10 +0200 Subject: perf: Add data_{offset,size} to user_page Currently, the actual perf ring buffer is one page into the mmap area, following the user page and the userspace follows this convention. This patch adds data_{offset,size} fields to user_page that can be used by userspace instead for locating perf data in the mmap area. This is also helpful when mapping existing or shared buffers if their size is not known in advance. Right now, it is made to follow the existing convention that data_offset == PAGE_SIZE and data_offset + data_size == mmap_size. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 5 +++++ kernel/events/core.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 91803e54ee73..86c44ae66d43 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -522,9 +522,14 @@ struct perf_event_mmap_page { * In this case the kernel will not over-write unread data. * * See perf_output_put_handle() for the data ordering. + * + * data_{offset,size} indicate the location and size of the perf record + * buffer within the mmapped area. */ __u64 data_head; /* head in the data section */ __u64 data_tail; /* user-space written tail */ + __u64 data_offset; /* where the buffer starts */ + __u64 data_size; /* data buffer size */ }; #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5c13862d3e85..6efa516f1ab8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4105,6 +4105,8 @@ static void perf_event_init_userpage(struct perf_event *event) /* Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + userpg->data_offset = PAGE_SIZE; + userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); -- cgit v1.2.3 From 45bfb2e50471abbbfd83d40d28c986078b0d24ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2015 14:18:11 +0200 Subject: perf: Add AUX area to ring buffer for raw data streams This patch introduces "AUX space" in the perf mmap buffer, intended for exporting high bandwidth data streams to userspace, such as instruction flow traces. AUX space is a ring buffer, defined by aux_{offset,size} fields in the user_page structure, and read/write pointers aux_{head,tail}, which abide by the same rules as data_* counterparts of the main perf buffer. In order to allocate/mmap AUX, userspace needs to set up aux_offset to such an offset that will be greater than data_offset+data_size and aux_size to be the desired buffer size. Both need to be page aligned. Then, same aux_offset and aux_size should be passed to mmap() call and if everything adds up, you should have an AUX buffer as a result. Pages that are mapped into this buffer also come out of user's mlock rlimit plus perf_event_mlock_kb allowance. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 17 +++++ include/uapi/linux/perf_event.h | 16 +++++ kernel/events/core.c | 141 +++++++++++++++++++++++++++++++++------- kernel/events/internal.h | 23 +++++++ kernel/events/ring_buffer.c | 97 +++++++++++++++++++++++++-- 5 files changed, 265 insertions(+), 29 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 401554074de9..5a94f6d6fa91 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -284,6 +284,18 @@ struct pmu { * Return the count value for a counter. */ u64 (*count) (struct perf_event *event); /*optional*/ + + /* + * Set up pmu-private data structures for an AUX area + */ + void *(*setup_aux) (int cpu, void **pages, + int nr_pages, bool overwrite); + /* optional */ + + /* + * Free pmu-private AUX data structures + */ + void (*free_aux) (void *aux); /* optional */ }; /** @@ -862,6 +874,11 @@ static inline bool needs_branch_stack(struct perf_event *event) return event->attr.branch_sample_type != 0; } +static inline bool has_aux(struct perf_event *event) +{ + return event->pmu->setup_aux; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 86c44ae66d43..6c5013a71714 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -530,6 +530,22 @@ struct perf_event_mmap_page { __u64 data_tail; /* user-space written tail */ __u64 data_offset; /* where the buffer starts */ __u64 data_size; /* data buffer size */ + + /* + * AUX area is defined by aux_{offset,size} fields that should be set + * by the userspace, so that + * + * aux_offset >= data_offset + data_size + * + * prior to mmap()ing it. Size of the mmap()ed area should be aux_size. + * + * Ring buffer pointers aux_{head,tail} have the same semantics as + * data_{head,tail} and same ordering rules apply. + */ + __u64 aux_head; + __u64 aux_tail; + __u64 aux_offset; + __u64 aux_size; }; #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6efa516f1ab8..da51128c337a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4306,6 +4306,9 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); + if (vma->vm_pgoff) + atomic_inc(&event->rb->aux_mmap_count); + if (event->pmu->event_mapped) event->pmu->event_mapped(event); } @@ -4330,6 +4333,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (event->pmu->event_unmapped) event->pmu->event_unmapped(event); + /* + * rb->aux_mmap_count will always drop before rb->mmap_count and + * event->mmap_count, so it is ok to use event->mmap_mutex to + * serialize with perf_mmap here. + */ + if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && + atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + + rb_free_aux(rb); + mutex_unlock(&event->mmap_mutex); + } + atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) @@ -4403,7 +4420,7 @@ out_put: static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, - .close = perf_mmap_close, + .close = perf_mmap_close, /* non mergable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; @@ -4414,10 +4431,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct ring_buffer *rb; + struct ring_buffer *rb = NULL; unsigned long vma_size; unsigned long nr_pages; - long user_extra, extra; + long user_extra = 0, extra = 0; int ret = 0, flags = 0; /* @@ -4432,7 +4449,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; + + if (vma->vm_pgoff == 0) { + nr_pages = (vma_size / PAGE_SIZE) - 1; + } else { + /* + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. + */ + u64 aux_offset, aux_size; + + if (!event->rb) + return -EINVAL; + + nr_pages = vma_size / PAGE_SIZE; + + mutex_lock(&event->mmap_mutex); + ret = -EINVAL; + + rb = event->rb; + if (!rb) + goto aux_unlock; + + aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); + aux_size = ACCESS_ONCE(rb->user_page->aux_size); + + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + goto aux_unlock; + + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + goto aux_unlock; + + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + goto aux_unlock; + + if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) + goto aux_unlock; + + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + goto aux_unlock; + + if (!is_power_of_2(nr_pages)) + goto aux_unlock; + + if (!atomic_inc_not_zero(&rb->mmap_count)) + goto aux_unlock; + + if (rb_has_aux(rb)) { + atomic_inc(&rb->aux_mmap_count); + ret = 0; + goto unlock; + } + + atomic_set(&rb->aux_mmap_count, 1); + user_extra = nr_pages; + + goto accounting; + } /* * If we have rb pages ensure they're a power-of-two number, so we @@ -4444,9 +4520,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; - if (vma->vm_pgoff != 0) - return -EINVAL; - WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); @@ -4470,6 +4543,8 @@ again: } user_extra = nr_pages + 1; + +accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* @@ -4479,7 +4554,6 @@ again: user_locked = atomic_long_read(&user->locked_vm) + user_extra; - extra = 0; if (user_locked > user_lock_limit) extra = user_locked - user_lock_limit; @@ -4493,35 +4567,45 @@ again: goto unlock; } - WARN_ON(event->rb); + WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; - rb = rb_alloc(nr_pages, - event->attr.watermark ? event->attr.wakeup_watermark : 0, - event->cpu, flags); - if (!rb) { - ret = -ENOMEM; - goto unlock; - } + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, flags); - atomic_set(&rb->mmap_count, 1); - rb->mmap_locked = extra; - rb->mmap_user = get_current_user(); + if (!rb) { + ret = -ENOMEM; + goto unlock; + } - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; - ring_buffer_attach(event, rb); + ring_buffer_attach(event, rb); - perf_event_init_userpage(event); - perf_event_update_userpage(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); + } else { + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, flags); + if (!ret) + rb->aux_mmap_locked = extra; + } unlock: - if (!ret) + if (!ret) { + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->pinned_vm += extra; + atomic_inc(&event->mmap_count); + } else if (rb) { + atomic_dec(&rb->mmap_count); + } +aux_unlock: mutex_unlock(&event->mmap_mutex); /* @@ -7506,6 +7590,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->clock != event->clock) goto out; + /* + * If both events generate aux data, they must be on the same PMU + */ + if (has_aux(event) && has_aux(output_event) && + event->pmu != output_event->pmu) + goto out; + set: mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 569b218782ad..0f6d08015927 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -35,6 +35,16 @@ struct ring_buffer { unsigned long mmap_locked; struct user_struct *mmap_user; + /* AUX area */ + unsigned long aux_pgoff; + int aux_nr_pages; + atomic_t aux_mmap_count; + unsigned long aux_mmap_locked; + void (*free_aux)(void *); + atomic_t aux_refcount; + void **aux_pages; + void *aux_priv; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; @@ -43,6 +53,14 @@ extern void rb_free(struct ring_buffer *rb); extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); +extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, + pgoff_t pgoff, int nr_pages, int flags); +extern void rb_free_aux(struct ring_buffer *rb); + +static inline bool rb_has_aux(struct ring_buffer *rb) +{ + return !!rb->aux_nr_pages; +} extern void perf_event_header__init_id(struct perf_event_header *header, @@ -81,6 +99,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } +static inline unsigned long perf_aux_size(struct ring_buffer *rb) +{ + return rb->aux_nr_pages << PAGE_SHIFT; +} + #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ static inline unsigned long \ func_name(struct perf_output_handle *handle, \ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index eadb95ce7aac..3de9c4e9ea9f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -243,14 +243,87 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) spin_lock_init(&rb->event_lock); } +int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, + pgoff_t pgoff, int nr_pages, int flags) +{ + bool overwrite = !(flags & RING_BUFFER_WRITABLE); + int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); + int ret = -ENOMEM; + + if (!has_aux(event)) + return -ENOTSUPP; + + rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); + if (!rb->aux_pages) + return -ENOMEM; + + rb->free_aux = event->pmu->free_aux; + for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages; + rb->aux_nr_pages++) { + struct page *page; + + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + goto out; + + rb->aux_pages[rb->aux_nr_pages] = page_address(page); + } + + rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + overwrite); + if (!rb->aux_priv) + goto out; + + ret = 0; + + /* + * aux_pages (and pmu driver's private data, aux_priv) will be + * referenced in both producer's and consumer's contexts, thus + * we keep a refcount here to make sure either of the two can + * reference them safely. + */ + atomic_set(&rb->aux_refcount, 1); + +out: + if (!ret) + rb->aux_pgoff = pgoff; + else + rb_free_aux(rb); + + return ret; +} + +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + for (pg = 0; pg < rb->aux_nr_pages; pg++) + free_page((unsigned long)rb->aux_pages[pg]); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; +} + +void rb_free_aux(struct ring_buffer *rb) +{ + if (atomic_dec_and_test(&rb->aux_refcount)) + __rb_free_aux(rb); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* * Back perf_mmap() with regular GFP_KERNEL-0 pages. */ -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { if (pgoff > rb->nr_pages) return NULL; @@ -340,8 +413,8 @@ static int data_page_nr(struct ring_buffer *rb) return rb->nr_pages << page_order(rb); } -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { /* The '>' counts in the user page. */ if (pgoff > data_page_nr(rb)) @@ -416,3 +489,19 @@ fail: } #endif + +struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +{ + if (rb->aux_nr_pages) { + /* above AUX space */ + if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) + return NULL; + + /* AUX space */ + if (pgoff >= rb->aux_pgoff) + return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); + } + + return __perf_mmap_to_page(rb, pgoff); +} -- cgit v1.2.3 From 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:15 +0200 Subject: perf: Add AUX record When there's new data in the AUX space, output a record indicating its offset and size and a set of flags, such as PERF_AUX_FLAG_TRUNCATED, to mean the described data was truncated to fit in the ring buffer. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-7-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 19 +++++++++++++++++++ kernel/events/core.c | 34 ++++++++++++++++++++++++++++++++++ kernel/events/internal.h | 3 +++ 3 files changed, 56 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 6c5013a71714..8904ad3a850b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -768,6 +768,20 @@ enum perf_event_type { */ PERF_RECORD_MMAP2 = 10, + /* + * Records that new data landed in the AUX buffer part. + * + * struct { + * struct perf_event_header header; + * + * u64 aux_offset; + * u64 aux_size; + * u64 flags; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX = 11, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -785,6 +799,11 @@ enum perf_callchain_context { PERF_CONTEXT_MAX = (__u64)-4095, }; +/** + * PERF_RECORD_AUX::flags bits + */ +#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ + #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) #define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 6d9fdaef7b57..dbc2eff32230 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5916,6 +5916,40 @@ void perf_event_mmap(struct vm_area_struct *vma) perf_event_mmap_event(&mmap_event); } +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u64 offset; + u64 size; + u64 flags; + } rec = { + .header = { + .type = PERF_RECORD_AUX, + .misc = 0, + .size = sizeof(rec), + }, + .offset = head, + .size = size, + .flags = flags, + }; + int ret; + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * IRQ throttle logging */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 0f6d08015927..4d117a981431 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -62,6 +62,9 @@ static inline bool rb_has_aux(struct ring_buffer *rb) return !!rb->aux_nr_pages; } +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags); + extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, -- cgit v1.2.3 From 2023a0d2829e521fe6ad6b9907f3f90bfbf57142 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:17 +0200 Subject: perf: Support overwrite mode for the AUX area This adds support for overwrite mode in the AUX area, which means "keep collecting data till you're stopped", turning AUX area into a circular buffer, where new data overwrites old data. It does not depend on data buffer's overwrite mode, so that it doesn't lose sideband data that is instrumental for processing AUX data. Overwrite mode is enabled at mapping AUX area read only. Even though aux_tail in the buffer's user page might be user writable, it will be ignored in this mode. A PERF_RECORD_AUX with PERF_AUX_FLAG_OVERWRITE set is written to the perf data stream every time an event writes new data to the AUX area. The pmu driver might not be able to infer the exact beginning of the new data in each snapshot, some drivers will only provide the tail, which is aux_offset + aux_size in the AUX record. Consumer has to be able to tell the new data from the old one, for example, by means of time stamps if such are provided in the trace. Consumer is also responsible for disabling any events that might write to the AUX area (thus potentially racing with the consumer) before collecting the data. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-9-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 1 + kernel/events/internal.h | 1 + kernel/events/ring_buffer.c | 48 ++++++++++++++++++++++++++++------------- 3 files changed, 35 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 8904ad3a850b..29ef2f73bb4a 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -803,6 +803,7 @@ enum perf_callchain_context { * PERF_RECORD_AUX::flags bits */ #define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) diff --git a/kernel/events/internal.h b/kernel/events/internal.h index b701ebc32570..ffd51d9f5945 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -40,6 +40,7 @@ struct ring_buffer { local_t aux_nest; unsigned long aux_pgoff; int aux_nr_pages; + int aux_overwrite; atomic_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 0cc7b0f39058..67b328337a41 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -283,26 +283,33 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, goto err_put; aux_head = local_read(&rb->aux_head); - aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); handle->rb = rb; handle->event = event; handle->head = aux_head; - if (aux_head - aux_tail < perf_aux_size(rb)) - handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); - else - handle->size = 0; + handle->size = 0; /* - * handle->size computation depends on aux_tail load; this forms a - * control dependency barrier separating aux_tail load from aux data - * store that will be enabled on successful return + * In overwrite mode, AUX data stores do not depend on aux_tail, + * therefore (A) control dependency barrier does not exist. The + * (B) <-> (C) ordering is still observed by the pmu driver. */ - if (!handle->size) { /* A, matches D */ - event->pending_disable = 1; - perf_output_wakeup(handle); - local_set(&rb->aux_nest, 0); - goto err_put; + if (!rb->aux_overwrite) { + aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + if (aux_head - aux_tail < perf_aux_size(rb)) + handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); + + /* + * handle->size computation depends on aux_tail load; this forms a + * control dependency barrier separating aux_tail load from aux data + * store that will be enabled on successful return + */ + if (!handle->size) { /* A, matches D */ + event->pending_disable = 1; + perf_output_wakeup(handle); + local_set(&rb->aux_nest, 0); + goto err_put; + } } return handle->rb->aux_priv; @@ -327,13 +334,22 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, bool truncated) { struct ring_buffer *rb = handle->rb; - unsigned long aux_head = local_read(&rb->aux_head); + unsigned long aux_head; u64 flags = 0; if (truncated) flags |= PERF_AUX_FLAG_TRUNCATED; - local_add(size, &rb->aux_head); + /* in overwrite mode, driver provides aux_head via handle */ + if (rb->aux_overwrite) { + flags |= PERF_AUX_FLAG_OVERWRITE; + + aux_head = handle->head; + local_set(&rb->aux_head, aux_head); + } else { + aux_head = local_read(&rb->aux_head); + local_add(size, &rb->aux_head); + } if (size || flags) { /* @@ -480,6 +496,8 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, */ atomic_set(&rb->aux_refcount, 1); + rb->aux_overwrite = overwrite; + out: if (!ret) rb->aux_pgoff = pgoff; -- cgit v1.2.3 From 1a5941312414c71dece6717da9a0fa1303127afa Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:18 +0200 Subject: perf: Add wakeup watermark control to the AUX area When AUX area gets a certain amount of new data, we want to wake up userspace to collect it. This adds a new control to specify how much data will cause a wakeup. This is then passed down to pmu drivers via output handle's "wakeup" field, so that the driver can find the nearest point where it can generate an interrupt. We repurpose __reserved_2 in the event attribute for this, even though it was never checked to be zero before, aux_watermark will only matter for new AUX-aware code, so the old code should still be fine. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-10-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 7 +++++++ kernel/events/core.c | 3 ++- kernel/events/internal.h | 4 +++- kernel/events/ring_buffer.c | 22 +++++++++++++++++++--- 4 files changed, 31 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 29ef2f73bb4a..84819546c8ce 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -261,6 +261,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ /* add: sample_stack_user */ #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -366,6 +367,12 @@ struct perf_event_attr { * See asm/perf_regs.h for details. */ __u64 sample_regs_intr; + + /* + * Wakeup watermark for AUX area + */ + __u32 aux_watermark; + __u32 __reserved_2; /* align to __u64 */ }; #define perf_flags(attr) (*(&(attr)->read_format + 1)) diff --git a/kernel/events/core.c b/kernel/events/core.c index 81e8d14ac59a..31f6b504ad62 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4677,7 +4677,8 @@ accounting: perf_event_init_userpage(event); perf_event_update_userpage(event); } else { - ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, flags); + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, + event->attr.aux_watermark, flags); if (!ret) rb->aux_mmap_locked = extra; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ffd51d9f5945..9f6ce9ba4a04 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -27,6 +27,7 @@ struct ring_buffer { local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ + long aux_watermark; /* poll crap */ spinlock_t event_lock; struct list_head event_list; @@ -38,6 +39,7 @@ struct ring_buffer { /* AUX area */ local_t aux_head; local_t aux_nest; + local_t aux_wakeup; unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; @@ -57,7 +59,7 @@ extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, - pgoff_t pgoff, int nr_pages, int flags); + pgoff_t pgoff, int nr_pages, long watermark, int flags); extern void rb_free_aux(struct ring_buffer *rb); extern struct ring_buffer *ring_buffer_get(struct perf_event *event); extern void ring_buffer_put(struct ring_buffer *rb); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 67b328337a41..232f00f273cb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -296,6 +296,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, */ if (!rb->aux_overwrite) { aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); @@ -359,9 +360,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, perf_event_aux_event(handle->event, aux_head, size, flags); } - rb->user_page->aux_head = local_read(&rb->aux_head); + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); - perf_output_wakeup(handle); + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + } handle->event = NULL; local_set(&rb->aux_nest, 0); @@ -383,6 +387,14 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) local_add(size, &rb->aux_head); + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + handle->wakeup = local_read(&rb->aux_wakeup) + + rb->aux_watermark; + } + handle->head = aux_head; handle->size -= size; @@ -433,7 +445,7 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) } int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, - pgoff_t pgoff, int nr_pages, int flags) + pgoff_t pgoff, int nr_pages, long watermark, int flags) { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); @@ -497,6 +509,10 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, atomic_set(&rb->aux_refcount, 1); rb->aux_overwrite = overwrite; + rb->aux_watermark = watermark; + + if (!rb->aux_watermark && !rb->aux_overwrite) + rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); out: if (!ret) -- cgit v1.2.3 From ec0d7729bbaed4b9d2d3fada693278e13a3d1368 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:23 +0200 Subject: perf: Add ITRACE_START record to indicate that tracing has started For counters that generate AUX data that is bound to the context of a running task, such as instruction tracing, the decoder needs to know exactly which task is running when the event is first scheduled in, before the first sched_switch. The decoder's need to know this stems from the fact that instruction flow trace decoding will almost always require program's object code in order to reconstruct said flow and for that we need at least its pid/tid in the perf stream. To single out such instruction tracing pmus, this patch introduces ITRACE PMU capability. The reason this is not part of RECORD_AUX record is that not all pmus capable of generating AUX data need this, and the opposite is *probably* also true. While sched_switch covers for most cases, there are two problems with it: the consumer will need to process events out of order (that is, having found RECORD_AUX, it will have to skip forward to the nearest sched_switch to figure out which task it was, then go back to the actual trace to decode it) and it completely misses the case when the tracing is enabled and disabled before sched_switch, for example, via PERF_EVENT_IOC_DISABLE. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-15-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++++ include/uapi/linux/perf_event.h | 11 +++++++++++ kernel/events/core.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 45c5873ad9b3..61992cf2e977 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,6 +129,9 @@ struct hw_perf_event { struct list_head cqm_groups_entry; struct list_head cqm_group_entry; }; + struct { /* itrace */ + int itrace_started; + }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* @@ -177,6 +180,7 @@ struct perf_event; #define PERF_PMU_CAP_AUX_NO_SG 0x04 #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 #define PERF_PMU_CAP_EXCLUSIVE 0x10 +#define PERF_PMU_CAP_ITRACE 0x20 /** * struct pmu - generic performance monitoring unit diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 84819546c8ce..309211b3eb67 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -789,6 +789,17 @@ enum perf_event_type { */ PERF_RECORD_AUX = 11, + /* + * Indicates that instruction trace has started + * + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * }; + */ + PERF_RECORD_ITRACE_START = 12, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 31f6b504ad62..06917d537302 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1831,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event, #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); +static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, @@ -1869,6 +1870,8 @@ event_sched_in(struct perf_event *event, perf_set_shadow_time(event, ctx, tstamp); + perf_log_itrace_start(event); + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; @@ -5991,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +static void perf_log_itrace_start(struct perf_event *event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u32 pid; + u32 tid; + } rec; + int ret; + + if (event->parent) + event = event->parent; + + if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || + event->hw.itrace_started) + return; + + event->hw.itrace_started = 1; + + rec.header.type = PERF_RECORD_ITRACE_START; + rec.header.misc = 0; + rec.header.size = sizeof(rec); + rec.pid = perf_event_pid(event, current); + rec.tid = perf_event_tid(event, current); + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * Generic event overflow handling, sampling. */ -- cgit v1.2.3 From bdf765071a8b573f7b1ba14c02881fa3e623825e Mon Sep 17 00:00:00 2001 From: Sean Paul Date: Fri, 30 Jan 2015 13:57:01 -0500 Subject: drm/tegra: gem: Return 64-bit offset for mmap(2) On 64-bit targets, tegra_gem_mmap() only returns a partial offset to userspace. As such, subsequent calls to mmap(2) may fail. Change the arguments to use a 64-bit offset to fix this. Signed-off-by: Sean Paul Acked-by: Erik Faye-Lund [treding@nvidia.com: tweak commit message] Signed-off-by: Thierry Reding --- include/uapi/drm/tegra_drm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/tegra_drm.h b/include/uapi/drm/tegra_drm.h index c15d781ecc0f..5391780c2b05 100644 --- a/include/uapi/drm/tegra_drm.h +++ b/include/uapi/drm/tegra_drm.h @@ -36,7 +36,8 @@ struct drm_tegra_gem_create { struct drm_tegra_gem_mmap { __u32 handle; - __u32 offset; + __u32 pad; + __u64 offset; }; struct drm_tegra_syncpt_read { -- cgit v1.2.3 From bcad57182425426dd4aa14deb27f97acb329f3cd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 3 Apr 2015 20:52:24 +0200 Subject: ebpf: add skb->priority to offset map for usage in {cls, act}_bpf This adds the ability to read out the skb->priority from an eBPF program, so that it can be taken into account from a tc filter or action for the use-case where the priority is not being used to directly override the filter classification in a qdisc, but to tag traffic otherwise for the classifier; the priority can be assigned from various places incl. user space, in future we may also mangle it from an eBPF program. Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 74aab6e0d964..0db8580f3cca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -184,6 +184,7 @@ struct __sk_buff { __u32 vlan_present; __u32 vlan_tci; __u32 vlan_proto; + __u32 priority; }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index 444a07e4f68d..955a7d77decd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1304,6 +1304,13 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, offsetof(struct sk_buff, vlan_proto)); break; + case offsetof(struct __sk_buff, priority): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, priority)); + break; + case offsetof(struct __sk_buff, mark): return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); -- cgit v1.2.3 From 91bc4822c3d61b9bb7ef66d3b77948a4f9177954 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Apr 2015 17:12:13 -0700 Subject: tc: bpf: add checksum helpers Commit 608cd71a9c7c ("tc: bpf: generalize pedit action") has added the possibility to mangle packet data to BPF programs in the tc pipeline. This patch adds two helpers bpf_l3_csum_replace() and bpf_l4_csum_replace() for fixing up the protocol checksums after the packet mangling. It also adds 'flags' argument to bpf_skb_store_bytes() helper to avoid unnecessary checksum recomputations when BPF programs adjusting l3/l4 checksums and documents all three helpers in uapi header. Moreover, a sample program is added to show how BPF programs can make use of the mangle and csum helpers. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 38 +++++++++++++++- net/core/filter.c | 108 ++++++++++++++++++++++++++++++++++++++++++++-- samples/bpf/Makefile | 1 + samples/bpf/bpf_helpers.h | 7 +++ samples/bpf/tcbpf1_kern.c | 71 ++++++++++++++++++++++++++++++ 5 files changed, 220 insertions(+), 5 deletions(-) create mode 100644 samples/bpf/tcbpf1_kern.c (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0db8580f3cca..23df3e7f8e7d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -168,7 +168,43 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ - BPF_FUNC_skb_store_bytes, /* int skb_store_bytes(skb, offset, from, len) */ + + /** + * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet + * @skb: pointer to skb + * @offset: offset within packet from skb->data + * @from: pointer where to copy bytes from + * @len: number of bytes to store into packet + * @flags: bit 0 - if true, recompute skb->csum + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_skb_store_bytes, + + /** + * l3_csum_replace(skb, offset, from, to, flags) - recompute IP checksum + * @skb: pointer to skb + * @offset: offset within packet where IP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_l3_csum_replace, + + /** + * l4_csum_replace(skb, offset, from, to, flags) - recompute TCP/UDP checksum + * @skb: pointer to skb + * @offset: offset within packet where TCP/UDP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * bit 4 - is pseudo header + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_l4_csum_replace, __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 955a7d77decd..b669e75d2b36 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1175,7 +1175,9 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } -static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) + +static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; unsigned int offset = (unsigned int) r2; @@ -1192,7 +1194,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) * * so check for invalid 'offset' and too large 'len' */ - if (offset > 0xffff || len > sizeof(buf)) + if (unlikely(offset > 0xffff || len > sizeof(buf))) return -EFAULT; if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len)) @@ -1202,7 +1204,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) if (unlikely(!ptr)) return -EFAULT; - skb_postpull_rcsum(skb, ptr, len); + if (BPF_RECOMPUTE_CSUM(flags)) + skb_postpull_rcsum(skb, ptr, len); memcpy(ptr, from, len); @@ -1210,7 +1213,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) /* skb_store_bits cannot return -EFAULT here */ skb_store_bits(skb, offset, ptr, len); - if (skb->ip_summed == CHECKSUM_COMPLETE) + if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0)); return 0; } @@ -1223,6 +1226,99 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_STACK, .arg4_type = ARG_CONST_STACK_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) +#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) + +static u64 bpf_l3_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + __sum16 sum, *ptr; + + if (unlikely(offset > 0xffff)) + return -EFAULT; + + if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum))) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); + if (unlikely(!ptr)) + return -EFAULT; + + switch (BPF_HEADER_FIELD_SIZE(flags)) { + case 2: + csum_replace2(ptr, from, to); + break; + case 4: + csum_replace4(ptr, from, to); + break; + default: + return -EINVAL; + } + + if (ptr == &sum) + /* skb_store_bits guaranteed to not return -EFAULT here */ + skb_store_bits(skb, offset, ptr, sizeof(sum)); + + return 0; +} + +const struct bpf_func_proto bpf_l3_csum_replace_proto = { + .func = bpf_l3_csum_replace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +static u64 bpf_l4_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags); + __sum16 sum, *ptr; + + if (unlikely(offset > 0xffff)) + return -EFAULT; + + if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum))) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); + if (unlikely(!ptr)) + return -EFAULT; + + switch (BPF_HEADER_FIELD_SIZE(flags)) { + case 2: + inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); + break; + case 4: + inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); + break; + default: + return -EINVAL; + } + + if (ptr == &sum) + /* skb_store_bits guaranteed to not return -EFAULT here */ + skb_store_bits(skb, offset, ptr, sizeof(sum)); + + return 0; +} + +const struct bpf_func_proto bpf_l4_csum_replace_proto = { + .func = bpf_l4_csum_replace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto * @@ -1250,6 +1346,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; default: return sk_filter_func_proto(func_id); } diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index b5b3600dcdf5..d24f51bca465 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -17,6 +17,7 @@ sockex2-objs := bpf_load.o libbpf.o sockex2_user.o always := $(hostprogs-y) always += sockex1_kern.o always += sockex2_kern.o +always += tcbpf1_kern.o HOSTCFLAGS += -I$(objtree)/usr/include diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index ca0333146006..72540ec1f003 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -37,4 +37,11 @@ struct bpf_map_def { unsigned int max_entries; }; +static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = + (void *) BPF_FUNC_skb_store_bytes; +static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = + (void *) BPF_FUNC_l3_csum_replace; +static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = + (void *) BPF_FUNC_l4_csum_replace; + #endif diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c new file mode 100644 index 000000000000..7cf3f42a6e39 --- /dev/null +++ b/samples/bpf/tcbpf1_kern.c @@ -0,0 +1,71 @@ +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +/* compiler workaround */ +#define _htonl __builtin_bswap32 + +static inline void set_dst_mac(struct __sk_buff *skb, char *mac) +{ + bpf_skb_store_bytes(skb, 0, mac, ETH_ALEN, 1); +} + +/* use 1 below for ingress qdisc and 0 for egress */ +#if 0 +#undef ETH_HLEN +#define ETH_HLEN 0 +#endif + +#define IP_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check)) +#define TOS_OFF (ETH_HLEN + offsetof(struct iphdr, tos)) + +static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) +{ + __u8 old_tos = load_byte(skb, TOS_OFF); + + bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2); + bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0); +} + +#define TCP_CSUM_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, check)) +#define IP_SRC_OFF (ETH_HLEN + offsetof(struct iphdr, saddr)) + +#define IS_PSEUDO 0x10 + +static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) +{ + __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF)); + + bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip)); + bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); + bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0); +} + +#define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest)) +static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) +{ + __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF)); + + bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port)); + bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0); +} + +SEC("classifier") +int bpf_prog1(struct __sk_buff *skb) +{ + __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); + long *value; + + if (proto == IPPROTO_TCP) { + set_ip_tos(skb, 8); + set_tcp_ip_src(skb, 0xA010101); + set_tcp_dest_port(skb, 5001); + } + + return 0; +} +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 9a9634545c7051f567096117d417e9c3be24706d Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 7 Apr 2015 11:51:53 +0200 Subject: netns: notify netns id events With this patch, netns ids that are created and deleted are advertised into the group RTNLGRP_NSID. Because callers of rtnl_net_notifyid() already know the id of the peer, there is no need to call __peernet2id() in rtnl_net_fill(). Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 4 ++++ net/core/net_namespace.c | 52 +++++++++++++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index bea910f924dd..974db03f7b1a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -134,6 +134,8 @@ enum { RTM_NEWNSID = 88, #define RTM_NEWNSID RTM_NEWNSID + RTM_DELNSID = 89, +#define RTM_DELNSID RTM_DELNSID RTM_GETNSID = 90, #define RTM_GETNSID RTM_GETNSID @@ -635,6 +637,8 @@ enum rtnetlink_groups { #define RTNLGRP_MDB RTNLGRP_MDB RTNLGRP_MPLS_ROUTE, #define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE + RTNLGRP_NSID, +#define RTNLGRP_NSID RTNLGRP_NSID __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index be28afccfbbb..b3b5f22f0e90 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -148,9 +148,11 @@ static void ops_free_list(const struct pernet_operations *ops, } } +static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, + int id); static int alloc_netid(struct net *net, struct net *peer, int reqid) { - int min = 0, max = 0; + int min = 0, max = 0, id; ASSERT_RTNL(); @@ -159,7 +161,11 @@ static int alloc_netid(struct net *net, struct net *peer, int reqid) max = reqid + 1; } - return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); + id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); + if (id >= 0) + rtnl_net_notifyid(net, peer, RTM_NEWNSID, id); + + return id; } /* This function is used by idr_for_each(). If net is equal to peer, the @@ -359,8 +365,10 @@ static void cleanup_net(struct work_struct *work) for_each_net(tmp) { int id = __peernet2id(tmp, net, false); - if (id >= 0) + if (id >= 0) { + rtnl_net_notifyid(tmp, net, RTM_DELNSID, id); idr_remove(&tmp->netns_ids, id); + } } idr_destroy(&net->netns_ids); @@ -531,7 +539,8 @@ static int rtnl_net_get_size(void) } static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, struct net *peer) + int cmd, struct net *net, struct net *peer, + int nsid) { struct nlmsghdr *nlh; struct rtgenmsg *rth; @@ -546,9 +555,13 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - id = __peernet2id(net, peer, false); - if (id < 0) - id = NETNSA_NSID_NOT_ASSIGNED; + if (nsid >= 0) { + id = nsid; + } else { + id = __peernet2id(net, peer, false); + if (id < 0) + id = NETNSA_NSID_NOT_ASSIGNED; + } if (nla_put_s32(skb, NETNSA_NSID, id)) goto nla_put_failure; @@ -589,7 +602,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) } err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_GETNSID, net, peer); + RTM_GETNSID, net, peer, -1); if (err < 0) goto err_out; @@ -603,6 +616,29 @@ out: return err; } +static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, + int id) +{ + struct sk_buff *msg; + int err = -ENOMEM; + + msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); + if (!msg) + goto out; + + err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id); + if (err < 0) + goto err_out; + + rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0); + return; + +err_out: + nlmsg_free(msg); +out: + rtnl_set_sk_err(net, RTNLGRP_NSID, err); +} + static int __init net_ns_init(void) { struct net_generic *ng; -- cgit v1.2.3 From 22fe54d5fefcfa98c58cc2f4607dd26d9648b3f5 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 5 Apr 2015 14:41:08 +0200 Subject: netfilter: nf_tables: add support for dynamic set updates Add a new "dynset" expression for dynamic set updates. A new set op ->update() is added which, for non existant elements, invokes an initialization callback and inserts the new element. For both new or existing elements the extenstion pointer is returned to the caller to optionally perform timer updates or other actions. Element removal is not supported so far, however that seems to be a rather exotic need and can be added later on. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 17 +++ include/net/netfilter/nf_tables_core.h | 3 + include/uapi/linux/netfilter/nf_tables.h | 27 ++++ net/netfilter/Makefile | 2 +- net/netfilter/nf_tables_api.c | 10 +- net/netfilter/nf_tables_core.c | 7 + net/netfilter/nft_dynset.c | 218 +++++++++++++++++++++++++++++++ net/netfilter/nft_hash.c | 37 ++++++ 8 files changed, 315 insertions(+), 6 deletions(-) create mode 100644 net/netfilter/nft_dynset.c (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e7e6365c248f..38c3496f7bf2 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -196,6 +196,7 @@ struct nft_set_estimate { }; struct nft_set_ext; +struct nft_expr; /** * struct nft_set_ops - nf_tables set operations @@ -218,6 +219,15 @@ struct nft_set_ops { bool (*lookup)(const struct nft_set *set, const struct nft_data *key, const struct nft_set_ext **ext); + bool (*update)(struct nft_set *set, + const struct nft_data *key, + void *(*new)(struct nft_set *, + const struct nft_expr *, + struct nft_data []), + const struct nft_expr *expr, + struct nft_data data[], + const struct nft_set_ext **ext); + int (*insert)(const struct nft_set *set, const struct nft_set_elem *elem); void (*activate)(const struct nft_set *set, @@ -466,6 +476,11 @@ static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, return elem + set->ops->elemsize; } +void *nft_set_elem_init(const struct nft_set *set, + const struct nft_set_ext_tmpl *tmpl, + const struct nft_data *key, + const struct nft_data *data, + u64 timeout, gfp_t gfp); void nft_set_elem_destroy(const struct nft_set *set, void *elem); /** @@ -845,6 +860,8 @@ static inline u8 nft_genmask_cur(const struct net *net) return 1 << ACCESS_ONCE(net->nft.gencursor); } +#define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) + /* * Set element transaction helpers */ diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index a75fc8e27cd6..c6f400cfaac8 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -31,6 +31,9 @@ void nft_cmp_module_exit(void); int nft_lookup_module_init(void); void nft_lookup_module_exit(void); +int nft_dynset_module_init(void); +void nft_dynset_module_exit(void); + int nft_bitwise_module_init(void); void nft_bitwise_module_exit(void); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 83441cc4594b..0b87b2f67fe3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -515,6 +515,33 @@ enum nft_lookup_attributes { }; #define NFTA_LOOKUP_MAX (__NFTA_LOOKUP_MAX - 1) +enum nft_dynset_ops { + NFT_DYNSET_OP_ADD, + NFT_DYNSET_OP_UPDATE, +}; + +/** + * enum nft_dynset_attributes - dynset expression attributes + * + * @NFTA_DYNSET_SET_NAME: name of set the to add data to (NLA_STRING) + * @NFTA_DYNSET_SET_ID: uniquely identifier of the set in the transaction (NLA_U32) + * @NFTA_DYNSET_OP: operation (NLA_U32) + * @NFTA_DYNSET_SREG_KEY: source register of the key (NLA_U32) + * @NFTA_DYNSET_SREG_DATA: source register of the data (NLA_U32) + * @NFTA_DYNSET_TIMEOUT: timeout value for the new element (NLA_U64) + */ +enum nft_dynset_attributes { + NFTA_DYNSET_UNSPEC, + NFTA_DYNSET_SET_NAME, + NFTA_DYNSET_SET_ID, + NFTA_DYNSET_OP, + NFTA_DYNSET_SREG_KEY, + NFTA_DYNSET_SREG_DATA, + NFTA_DYNSET_TIMEOUT, + __NFTA_DYNSET_MAX, +}; +#define NFTA_DYNSET_MAX (__NFTA_DYNSET_MAX - 1) + /** * enum nft_payload_bases - nf_tables payload expression offset bases * diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 89f73a9e9874..a87d8b8ec730 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -70,7 +70,7 @@ obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o # nf_tables nf_tables-objs += nf_tables_core.o nf_tables_api.o -nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o +nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o obj-$(CONFIG_NF_TABLES) += nf_tables.o diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 90b898491da7..598e53eb64b3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3183,11 +3183,11 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, return trans; } -static void *nft_set_elem_init(const struct nft_set *set, - const struct nft_set_ext_tmpl *tmpl, - const struct nft_data *key, - const struct nft_data *data, - u64 timeout, gfp_t gfp) +void *nft_set_elem_init(const struct nft_set *set, + const struct nft_set_ext_tmpl *tmpl, + const struct nft_data *key, + const struct nft_data *data, + u64 timeout, gfp_t gfp) { struct nft_set_ext *ext; void *elem; diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index ef4dfcbaf149..7caf08a9225d 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -239,8 +239,14 @@ int __init nf_tables_core_module_init(void) if (err < 0) goto err6; + err = nft_dynset_module_init(); + if (err < 0) + goto err7; + return 0; +err7: + nft_payload_module_exit(); err6: nft_byteorder_module_exit(); err5: @@ -257,6 +263,7 @@ err1: void nf_tables_core_module_exit(void) { + nft_dynset_module_exit(); nft_payload_module_exit(); nft_byteorder_module_exit(); nft_bitwise_module_exit(); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c new file mode 100644 index 000000000000..eeb72dee78ef --- /dev/null +++ b/net/netfilter/nft_dynset.c @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2015 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_dynset { + struct nft_set *set; + struct nft_set_ext_tmpl tmpl; + enum nft_dynset_ops op:8; + enum nft_registers sreg_key:8; + enum nft_registers sreg_data:8; + u64 timeout; + struct nft_set_binding binding; +}; + +static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1]) +{ + const struct nft_dynset *priv = nft_expr_priv(expr); + u64 timeout; + void *elem; + + if (set->size && !atomic_add_unless(&set->nelems, 1, set->size)) + return NULL; + + timeout = priv->timeout ? : set->timeout; + elem = nft_set_elem_init(set, &priv->tmpl, + &data[priv->sreg_key], &data[priv->sreg_data], + timeout, GFP_ATOMIC); + if (elem == NULL) { + if (set->size) + atomic_dec(&set->nelems); + } + return elem; +} + +static void nft_dynset_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_dynset *priv = nft_expr_priv(expr); + struct nft_set *set = priv->set; + const struct nft_set_ext *ext; + u64 timeout; + + if (set->ops->update(set, &data[priv->sreg_key], nft_dynset_new, + expr, data, &ext)) { + if (priv->op == NFT_DYNSET_OP_UPDATE && + nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { + timeout = priv->timeout ? : set->timeout; + *nft_set_ext_expiration(ext) = jiffies + timeout; + return; + } + } + + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { + [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING }, + [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 }, + [NFTA_DYNSET_OP] = { .type = NLA_U32 }, + [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, + [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, + [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, +}; + +static int nft_dynset_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_dynset *priv = nft_expr_priv(expr); + struct nft_set *set; + u64 timeout; + int err; + + if (tb[NFTA_DYNSET_SET_NAME] == NULL || + tb[NFTA_DYNSET_OP] == NULL || + tb[NFTA_DYNSET_SREG_KEY] == NULL) + return -EINVAL; + + set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME]); + if (IS_ERR(set)) { + if (tb[NFTA_DYNSET_SET_ID]) + set = nf_tables_set_lookup_byid(ctx->net, + tb[NFTA_DYNSET_SET_ID]); + if (IS_ERR(set)) + return PTR_ERR(set); + } + + if (set->flags & NFT_SET_CONSTANT) + return -EBUSY; + + priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); + switch (priv->op) { + case NFT_DYNSET_OP_ADD: + break; + case NFT_DYNSET_OP_UPDATE: + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EOPNOTSUPP; + break; + default: + return -EOPNOTSUPP; + } + + timeout = 0; + if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EINVAL; + timeout = be64_to_cpu(nla_get_be64(tb[NFTA_DYNSET_TIMEOUT])); + } + + priv->sreg_key = ntohl(nla_get_be32(tb[NFTA_DYNSET_SREG_KEY])); + err = nft_validate_input_register(priv->sreg_key); + if (err < 0) + return err; + + if (tb[NFTA_DYNSET_SREG_DATA] != NULL) { + if (!(set->flags & NFT_SET_MAP)) + return -EINVAL; + if (set->dtype == NFT_DATA_VERDICT) + return -EOPNOTSUPP; + + priv->sreg_data = ntohl(nla_get_be32(tb[NFTA_DYNSET_SREG_DATA])); + err = nft_validate_input_register(priv->sreg_data); + if (err < 0) + return err; + } else if (set->flags & NFT_SET_MAP) + return -EINVAL; + + nft_set_ext_prepare(&priv->tmpl); + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen); + if (set->flags & NFT_SET_MAP) + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen); + if (set->flags & NFT_SET_TIMEOUT) { + if (timeout || set->timeout) + nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION); + } + + priv->timeout = timeout; + + err = nf_tables_bind_set(ctx, set, &priv->binding); + if (err < 0) + return err; + + priv->set = set; + return 0; +} + +static void nft_dynset_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_dynset *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + +static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_dynset *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_DYNSET_SREG_KEY, htonl(priv->sreg_key))) + goto nla_put_failure; + if (priv->set->flags & NFT_SET_MAP && + nla_put_be32(skb, NFTA_DYNSET_SREG_DATA, htonl(priv->sreg_data))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_DYNSET_OP, htonl(priv->op))) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name)) + goto nla_put_failure; + if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_dynset_type; +static const struct nft_expr_ops nft_dynset_ops = { + .type = &nft_dynset_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_dynset)), + .eval = nft_dynset_eval, + .init = nft_dynset_init, + .destroy = nft_dynset_destroy, + .dump = nft_dynset_dump, +}; + +static struct nft_expr_type nft_dynset_type __read_mostly = { + .name = "dynset", + .ops = &nft_dynset_ops, + .policy = nft_dynset_policy, + .maxattr = NFTA_DYNSET_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_dynset_module_init(void) +{ + return nft_register_expr(&nft_dynset_type); +} + +void nft_dynset_module_exit(void) +{ + nft_unregister_expr(&nft_dynset_type); +} diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index c74e2bf1a1e4..bc23806b7fbe 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -90,6 +90,42 @@ static bool nft_hash_lookup(const struct nft_set *set, return !!he; } +static bool nft_hash_update(struct nft_set *set, const struct nft_data *key, + void *(*new)(struct nft_set *, + const struct nft_expr *, + struct nft_data []), + const struct nft_expr *expr, + struct nft_data data[], + const struct nft_set_ext **ext) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he; + struct nft_hash_cmp_arg arg = { + .genmask = NFT_GENMASK_ANY, + .set = set, + .key = key, + }; + + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); + if (he != NULL) + goto out; + + he = new(set, expr, data); + if (he == NULL) + goto err1; + if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, + nft_hash_params)) + goto err2; +out: + *ext = &he->ext; + return true; + +err2: + nft_set_elem_destroy(set, he); +err1: + return false; +} + static int nft_hash_insert(const struct nft_set *set, const struct nft_set_elem *elem) { @@ -335,6 +371,7 @@ static struct nft_set_ops nft_hash_ops __read_mostly = { .deactivate = nft_hash_deactivate, .remove = nft_hash_remove, .lookup = nft_hash_lookup, + .update = nft_hash_update, .walk = nft_hash_walk, .features = NFT_SET_MAP | NFT_SET_TIMEOUT, .owner = THIS_MODULE, -- cgit v1.2.3 From 68e942e88add0ac8576fc8397e86495edf3dcea7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 5 Apr 2015 14:43:38 +0200 Subject: netfilter: nf_tables: support optional userdata for set elements Add an userdata set extension and allow the user to attach arbitrary data to set elements. This is intended to hold TLV encoded data like comments or DNS annotations that have no meaning to the kernel. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +++++++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 38c3496f7bf2..63c44bdfdd3b 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -350,6 +350,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_EXPIRATION: element expiration time + * @NFT_SET_EXT_USERDATA: user data associated with the element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { @@ -358,6 +359,7 @@ enum nft_set_extensions { NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_EXPIRATION, + NFT_SET_EXT_USERDATA, NFT_SET_EXT_NUM }; @@ -464,6 +466,11 @@ static inline unsigned long *nft_set_ext_expiration(const struct nft_set_ext *ex return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); } +static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_USERDATA); +} + static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 0b87b2f67fe3..05ee1e0804a3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -292,6 +292,7 @@ enum nft_set_elem_flags { * @NFTA_SET_ELEM_FLAGS: bitmask of nft_set_elem_flags (NLA_U32) * @NFTA_SET_ELEM_TIMEOUT: timeout value (NLA_U64) * @NFTA_SET_ELEM_EXPIRATION: expiration time (NLA_U64) + * @NFTA_SET_ELEM_USERDATA: user data (NLA_BINARY) */ enum nft_set_elem_attributes { NFTA_SET_ELEM_UNSPEC, @@ -300,6 +301,7 @@ enum nft_set_elem_attributes { NFTA_SET_ELEM_FLAGS, NFTA_SET_ELEM_TIMEOUT, NFTA_SET_ELEM_EXPIRATION, + NFTA_SET_ELEM_USERDATA, __NFTA_SET_ELEM_MAX }; #define NFTA_SET_ELEM_MAX (__NFTA_SET_ELEM_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 598e53eb64b3..0b96fa0d64b2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2872,6 +2872,10 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .len = sizeof(unsigned long), .align = __alignof__(unsigned long), }, + [NFT_SET_EXT_USERDATA] = { + .len = sizeof(struct nft_userdata), + .align = __alignof__(struct nft_userdata), + }, }; EXPORT_SYMBOL_GPL(nft_set_ext_types); @@ -2884,6 +2888,8 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -2964,6 +2970,15 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, goto nla_put_failure; } + if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { + struct nft_userdata *udata; + + udata = nft_set_ext_userdata(ext); + if (nla_put(skb, NFTA_SET_ELEM_USERDATA, + udata->len + 1, udata->data)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); return 0; @@ -3232,11 +3247,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_set_ext *ext; struct nft_set_elem elem; struct nft_set_binding *binding; + struct nft_userdata *udata; struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; u64 timeout; u32 flags; + u8 ulen; int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, @@ -3325,6 +3342,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_add(&tmpl, NFT_SET_EXT_DATA); } + /* The full maximum length of userdata can exceed the maximum + * offset value (U8_MAX) for following extensions, therefor it + * must be the last extension added. + */ + ulen = 0; + if (nla[NFTA_SET_ELEM_USERDATA] != NULL) { + ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]); + if (ulen > 0) + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA, + ulen); + } + err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, &elem.key, &data, timeout, GFP_KERNEL); @@ -3334,6 +3363,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; + if (ulen > 0) { + udata = nft_set_ext_userdata(ext); + udata->len = ulen - 1; + nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); + } trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) -- cgit v1.2.3 From 01a3d796813d6302af9f828f34b73d21a4b96c9a Mon Sep 17 00:00:00 2001 From: Vlad Zolotarov Date: Mon, 30 Mar 2015 21:35:23 +0300 Subject: if_link: Add an additional parameter to ifla_vf_info for RSS querying Add configuration setting for drivers to allow/block an RSS Redirection Table and a Hash Key querying for discrete VFs. On some devices VF share the mentioned above information with PF and querying it may adduce a theoretical security risk. We want to let a system administrator to decide if he/she wants to take this risk or not. Signed-off-by: Vlad Zolotarov Tested-by: Phil Schmitt Signed-off-by: Jeff Kirsher --- include/linux/if_link.h | 1 + include/linux/netdevice.h | 8 ++++++++ include/uapi/linux/if_link.h | 8 ++++++++ net/core/rtnetlink.c | 32 ++++++++++++++++++++++++++------ 4 files changed, 43 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 119130e9298b..da4929927f69 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -14,5 +14,6 @@ struct ifla_vf_info { __u32 linkstate; __u32 min_tx_rate; __u32 max_tx_rate; + __u32 rss_query_en; }; #endif /* _LINUX_IF_LINK_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bf6d9df34d7b..13acb3d8ecdd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -878,6 +878,11 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); * int (*ndo_set_vf_port)(struct net_device *dev, int vf, * struct nlattr *port[]); + * + * Enable or disable the VF ability to query its RSS Redirection Table and + * Hash Key. This is needed since on some devices VF share this information + * with PF and querying it may adduce a theoretical security risk. + * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); * int (*ndo_setup_tc)(struct net_device *dev, u8 tc) * Called to setup 'tc' number of traffic classes in the net device. This @@ -1099,6 +1104,9 @@ struct net_device_ops { struct nlattr *port[]); int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); + int (*ndo_set_vf_rss_query_en)( + struct net_device *dev, + int vf, bool setting); int (*ndo_setup_tc)(struct net_device *dev, u8 tc); #if IS_ENABLED(CONFIG_FCOE) int (*ndo_fcoe_enable)(struct net_device *dev); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7ffb18df01ca..d9cd19214b98 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -465,6 +465,9 @@ enum { IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ IFLA_VF_RATE, /* Min and Max TX Bandwidth Allocation */ + IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query + * on/off switch + */ __IFLA_VF_MAX, }; @@ -509,6 +512,11 @@ struct ifla_vf_link_state { __u32 link_state; }; +struct ifla_vf_rss_query_en { + __u32 vf; + __u32 setting; +}; + /* VF ports management section * * Nested layout of set/get msg is: diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7a836152359b..358d52a38533 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -818,7 +818,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + - nla_total_size(sizeof(struct ifla_vf_link_state))); + nla_total_size(sizeof(struct ifla_vf_link_state)) + + nla_total_size(sizeof(struct ifla_vf_rss_query_en))); return size; } else return 0; @@ -1132,14 +1133,16 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_rss_query_en vf_rss_query_en; /* * Not all SR-IOV capable drivers support the - * spoofcheck query. Preset to -1 so the user - * space tool can detect that the driver didn't - * report anything. + * spoofcheck and "RSS query enable" query. Preset to + * -1 so the user space tool can detect that the driver + * didn't report anything. */ ivi.spoofchk = -1; + ivi.rss_query_en = -1; memset(ivi.mac, 0, sizeof(ivi.mac)); /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero @@ -1152,7 +1155,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = - vf_linkstate.vf = ivi.vf; + vf_linkstate.vf = + vf_rss_query_en.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; @@ -1162,6 +1166,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; + vf_rss_query_en.setting = ivi.rss_query_en; vf = nla_nest_start(skb, IFLA_VF_INFO); if (!vf) { nla_nest_cancel(skb, vfinfo); @@ -1176,7 +1181,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), &vf_spoofchk) || nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), - &vf_linkstate)) + &vf_linkstate) || + nla_put(skb, IFLA_VF_RSS_QUERY_EN, + sizeof(vf_rss_query_en), + &vf_rss_query_en)) goto nla_put_failure; nla_nest_end(skb, vf); } @@ -1290,6 +1298,7 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, + [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { @@ -1500,6 +1509,17 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) ivl->link_state); break; } + case IFLA_VF_RSS_QUERY_EN: { + struct ifla_vf_rss_query_en *ivrssq_en; + + ivrssq_en = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_rss_query_en) + err = ops->ndo_set_vf_rss_query_en(dev, + ivrssq_en->vf, + ivrssq_en->setting); + break; + } default: err = -EINVAL; break; -- cgit v1.2.3 From 7a6c8c34e5b71ac50e39588e20b39494a9e1d8e5 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 10 Apr 2015 12:00:30 -0700 Subject: fou: implement FOU_CMD_GET Cc: Tom Herbert Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/fou.h | 1 + net/ipv4/fou.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h index c303588bb767..d2947c52dc67 100644 --- a/include/uapi/linux/fou.h +++ b/include/uapi/linux/fou.h @@ -25,6 +25,7 @@ enum { FOU_CMD_UNSPEC, FOU_CMD_ADD, FOU_CMD_DEL, + FOU_CMD_GET, __FOU_CMD_MAX, }; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index c244b1a65787..263710259774 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -21,6 +21,7 @@ struct fou { u8 protocol; u8 flags; __be16 port; + u16 type; struct udp_offload udp_offloads; struct list_head list; }; @@ -487,6 +488,8 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, goto error; } + fou->type = cfg->type; + udp_sk(sk)->encap_type = 1; udp_encap_enable(); @@ -617,6 +620,106 @@ static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) return fou_destroy(net, &cfg); } +static int fou_fill_info(struct fou *fou, struct sk_buff *msg) +{ + if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) || + nla_put_be16(msg, FOU_ATTR_PORT, fou->port) || + nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) || + nla_put_u8(msg, FOU_ATTR_TYPE, fou->type)) + return -1; + + if (fou->flags & FOU_F_REMCSUM_NOPARTIAL) + if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL)) + return -1; + return 0; +} + +static int fou_dump_info(struct fou *fou, u32 portid, u32 seq, + u32 flags, struct sk_buff *skb, u8 cmd) +{ + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + if (fou_fill_info(fou, skb) < 0) + goto nla_put_failure; + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct fou_net *fn = net_generic(net, fou_net_id); + struct sk_buff *msg; + struct fou_cfg cfg; + struct fou *fout; + __be16 port; + int ret; + + ret = parse_nl_config(info, &cfg); + if (ret) + return ret; + port = cfg.udp_config.local_udp_port; + if (port == 0) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + ret = -ESRCH; + mutex_lock(&fn->fou_lock); + list_for_each_entry(fout, &fn->fou_list, list) { + if (port == fout->port) { + ret = fou_dump_info(fout, info->snd_portid, + info->snd_seq, 0, msg, + info->genlhdr->cmd); + break; + } + } + mutex_unlock(&fn->fou_lock); + if (ret < 0) + goto out_free; + + return genlmsg_reply(msg, info); + +out_free: + nlmsg_free(msg); + return ret; +} + +static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct fou_net *fn = net_generic(net, fou_net_id); + struct fou *fout; + int idx = 0, ret; + + mutex_lock(&fn->fou_lock); + list_for_each_entry(fout, &fn->fou_list, list) { + if (idx++ < cb->args[0]) + continue; + ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + skb, FOU_CMD_GET); + if (ret) + goto done; + } + mutex_unlock(&fn->fou_lock); + +done: + cb->args[0] = idx; + return skb->len; +} + static const struct genl_ops fou_nl_ops[] = { { .cmd = FOU_CMD_ADD, @@ -630,6 +733,12 @@ static const struct genl_ops fou_nl_ops[] = { .policy = fou_nl_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = FOU_CMD_GET, + .doit = fou_nl_cmd_get_port, + .dumpit = fou_nl_dump, + .policy = fou_nl_policy, + }, }; size_t fou_encap_hlen(struct ip_tunnel_encap *e) -- cgit v1.2.3 From 49499c3e6e18b7677a63316f3ff54a16533dc28f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 11 Apr 2015 02:27:37 +0100 Subject: netfilter: nf_tables: switch registers to 32 bit addressing Switch the nf_tables registers from 128 bit addressing to 32 bit addressing to support so called concatenations, where multiple values can be concatenated over multiple registers for O(1) exact matches of multiple dimensions using sets. The old register values are mapped to areas of 128 bits for compatibility. When dumping register numbers, values are expressed using the old values if they refer to the beginning of a 128 bit area for compatibility. To support concatenations, register loads of less than a full 32 bit value need to be padded. This mainly affects the payload and exthdr expressions, which both unconditionally zero the last word before copying the data. Userspace fully passes the testsuite using both old and new register addressing. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 13 +++----- include/uapi/linux/netfilter/nf_tables.h | 31 +++++++++++++++++- net/bridge/netfilter/nft_meta_bridge.c | 2 +- net/ipv4/netfilter/nft_redir_ipv4.c | 4 +-- net/ipv6/netfilter/nft_redir_ipv6.c | 4 +-- net/netfilter/nf_tables_api.c | 54 +++++++++++++++++++++++++------- net/netfilter/nf_tables_core.c | 5 +-- net/netfilter/nft_bitwise.c | 4 +-- net/netfilter/nft_byteorder.c | 4 +-- net/netfilter/nft_ct.c | 4 +-- net/netfilter/nft_exthdr.c | 3 +- net/netfilter/nft_immediate.c | 2 +- net/netfilter/nft_lookup.c | 2 +- net/netfilter/nft_meta.c | 7 +++-- net/netfilter/nft_nat.c | 16 +++++----- net/netfilter/nft_payload.c | 3 +- 16 files changed, 110 insertions(+), 48 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f8f27a48bbe9..1f9b848c778c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -64,17 +64,15 @@ struct nft_data { */ struct nft_regs { union { - struct nft_data data[NFT_REG_MAX + 1]; + u32 data[20]; struct nft_verdict verdict; }; }; -static inline void nft_data_copy(struct nft_data *dst, - const struct nft_data *src) +static inline void nft_data_copy(u32 *dst, const struct nft_data *src, + unsigned int len) { - BUILD_BUG_ON(__alignof__(*dst) != __alignof__(u64)); - *(u64 *)&dst->data[0] = *(u64 *)&src->data[0]; - *(u64 *)&dst->data[2] = *(u64 *)&src->data[2]; + memcpy(dst, src, len); } static inline void nft_data_debug(const struct nft_data *data) @@ -502,8 +500,7 @@ static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, - const struct nft_data *key, - const struct nft_data *data, + const u32 *key, const u32 *data, u64 timeout, gfp_t gfp); void nft_set_elem_destroy(const struct nft_set *set, void *elem); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 05ee1e0804a3..4221a6c3a8a5 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -5,16 +5,45 @@ #define NFT_CHAIN_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 +/** + * enum nft_registers - nf_tables registers + * + * nf_tables used to have five registers: a verdict register and four data + * registers of size 16. The data registers have been changed to 16 registers + * of size 4. For compatibility reasons, the NFT_REG_[1-4] registers still + * map to areas of size 16, the 4 byte registers are addressed using + * NFT_REG32_00 - NFT_REG32_15. + */ enum nft_registers { NFT_REG_VERDICT, NFT_REG_1, NFT_REG_2, NFT_REG_3, NFT_REG_4, - __NFT_REG_MAX + __NFT_REG_MAX, + + NFT_REG32_00 = 8, + MFT_REG32_01, + NFT_REG32_02, + NFT_REG32_03, + NFT_REG32_04, + NFT_REG32_05, + NFT_REG32_06, + NFT_REG32_07, + NFT_REG32_08, + NFT_REG32_09, + NFT_REG32_10, + NFT_REG32_11, + NFT_REG32_12, + NFT_REG32_13, + NFT_REG32_14, + NFT_REG32_15, }; #define NFT_REG_MAX (__NFT_REG_MAX - 1) +#define NFT_REG_SIZE 16 +#define NFT_REG32_SIZE 4 + /** * enum nft_verdicts - nf_tables internal verdicts * diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 99dab70ecae0..a21269b83f16 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -24,7 +24,7 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, { const struct nft_meta *priv = nft_expr_priv(expr); const struct net_device *in = pkt->in, *out = pkt->out; - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; const struct net_bridge_port *p; switch (priv->key) { diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index 312cf6f3b6dc..d8d795df9c13 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -27,9 +27,9 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, memset(&mr, 0, sizeof(mr)); if (priv->sreg_proto_min) { mr.range[0].min.all = - *(__be16 *)®s->data[priv->sreg_proto_min].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_min]; mr.range[0].max.all = - *(__be16 *)®s->data[priv->sreg_proto_max].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_max]; mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index 0eed774815cf..effd393bd517 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -27,9 +27,9 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); if (priv->sreg_proto_min) { range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_min], range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_max], range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a25fd19453e7..03faf76ce3b8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3201,8 +3201,7 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, - const struct nft_data *key, - const struct nft_data *data, + const u32 *key, const u32 *data, u64 timeout, gfp_t gfp) { struct nft_set_ext *ext; @@ -3357,7 +3356,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, &elem.key, &data, + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.data, data.data, timeout, GFP_KERNEL); if (elem.priv == NULL) goto err3; @@ -4122,14 +4121,47 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, return 0; } +/** + * nft_parse_register - parse a register value from a netlink attribute + * + * @attr: netlink attribute + * + * Parse and translate a register value from a netlink attribute. + * Registers used to be 128 bit wide, these register numbers will be + * mapped to the corresponding 32 bit register numbers. + */ unsigned int nft_parse_register(const struct nlattr *attr) { - return ntohl(nla_get_be32(attr)); + unsigned int reg; + + reg = ntohl(nla_get_be32(attr)); + switch (reg) { + case NFT_REG_VERDICT...NFT_REG_4: + return reg * NFT_REG_SIZE / NFT_REG32_SIZE; + default: + return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; + } } EXPORT_SYMBOL_GPL(nft_parse_register); +/** + * nft_dump_register - dump a register value to a netlink attribute + * + * @skb: socket buffer + * @attr: attribute number + * @reg: register number + * + * Construct a netlink attribute containing the register number. For + * compatibility reasons, register numbers being a multiple of 4 are + * translated to the corresponding 128 bit register numbers. + */ int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg) { + if (reg % (NFT_REG_SIZE / NFT_REG32_SIZE) == 0) + reg = reg / (NFT_REG_SIZE / NFT_REG32_SIZE); + else + reg = reg - NFT_REG_SIZE / NFT_REG32_SIZE + NFT_REG32_00; + return nla_put_be32(skb, attr, htonl(reg)); } EXPORT_SYMBOL_GPL(nft_dump_register); @@ -4145,14 +4177,13 @@ EXPORT_SYMBOL_GPL(nft_dump_register); */ int nft_validate_register_load(enum nft_registers reg, unsigned int len) { - if (reg <= NFT_REG_VERDICT) + if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; - if (reg > NFT_REG_MAX) - return -ERANGE; if (len == 0) return -EINVAL; - if (len > FIELD_SIZEOF(struct nft_data, data)) + if (reg * NFT_REG32_SIZE + len > FIELD_SIZEOF(struct nft_regs, data)) return -ERANGE; + return 0; } EXPORT_SYMBOL_GPL(nft_validate_register_load); @@ -4200,13 +4231,12 @@ int nft_validate_register_store(const struct nft_ctx *ctx, return 0; default: - if (reg < NFT_REG_1) + if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; - if (reg > NFT_REG_MAX) - return -ERANGE; if (len == 0) return -EINVAL; - if (len > FIELD_SIZEOF(struct nft_data, data)) + if (reg * NFT_REG32_SIZE + len > + FIELD_SIZEOF(struct nft_regs, data)) return -ERANGE; if (data != NULL && type != NFT_DATA_VALUE) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 5ef07d17b358..f153b07073af 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -70,7 +70,7 @@ static void nft_cmp_fast_eval(const struct nft_expr *expr, const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); u32 mask = nft_cmp_fast_mask(priv->len); - if ((regs->data[priv->sreg].data[0] & mask) == priv->data) + if ((regs->data[priv->sreg] & mask) == priv->data) return; regs->verdict.code = NFT_BREAK; } @@ -81,7 +81,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, { const struct nft_payload *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; unsigned char *ptr; if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) @@ -94,6 +94,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, if (unlikely(ptr + priv->len >= skb_tail_pointer(skb))) return false; + *dest = 0; if (priv->len == 2) *(u16 *)dest = *(u16 *)ptr; else if (priv->len == 4) diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index aa1147032ace..f1a9be2aecd1 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -30,8 +30,8 @@ static void nft_bitwise_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); - const u32 *src = ®s->data[priv->sreg].data[0]; - u32 *dst = ®s->data[priv->dreg].data[0]; + const u32 *src = ®s->data[priv->sreg]; + u32 *dst = ®s->data[priv->dreg]; unsigned int i; for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 2ee3e57ad814..fde5145f2e36 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -30,8 +30,8 @@ static void nft_byteorder_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { const struct nft_byteorder *priv = nft_expr_priv(expr); - u32 *src = ®s->data[priv->sreg].data[0]; - u32 *dst = ®s->data[priv->dreg].data[0]; + u32 *src = ®s->data[priv->sreg]; + u32 *dst = ®s->data[priv->dreg]; union { u32 u32; u16 u16; } *s, *d; unsigned int i; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index fab8e754b18a..8cbca3432f90 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -35,7 +35,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; const struct nf_conn_help *help; @@ -156,7 +156,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr, const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; #ifdef CONFIG_NF_CONNTRACK_MARK - u32 value = regs->data[priv->sreg].data[0]; + u32 value = regs->data[priv->sreg]; #endif enum ip_conntrack_info ctinfo; struct nf_conn *ct; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 098ffee793d7..ba7aed13e174 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -30,7 +30,7 @@ static void nft_exthdr_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; unsigned int offset = 0; int err; @@ -39,6 +39,7 @@ static void nft_exthdr_eval(const struct nft_expr *expr, goto err; offset += priv->offset; + dest[priv->len / NFT_REG32_SIZE] = 0; if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) goto err; return; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 0682f600c7a5..1e8e412eadae 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -29,7 +29,7 @@ static void nft_immediate_eval(const struct nft_expr *expr, { const struct nft_immediate_expr *priv = nft_expr_priv(expr); - nft_data_copy(®s->data[priv->dreg], &priv->data); + nft_data_copy(®s->data[priv->dreg], &priv->data, priv->dlen); } static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index fc7afff81566..ba1466209f2a 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -36,7 +36,7 @@ static void nft_lookup_eval(const struct nft_expr *expr, if (set->ops->lookup(set, ®s->data[priv->sreg], &ext)) { if (set->flags & NFT_SET_MAP) nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext)); + nft_set_ext_data(ext), set->dlen); return; } regs->verdict.code = NFT_BREAK; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 5f744eb61de5..52561e1c31e2 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -31,13 +31,14 @@ void nft_meta_get_eval(const struct nft_expr *expr, const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; const struct net_device *in = pkt->in, *out = pkt->out; - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; switch (priv->key) { case NFT_META_LEN: *dest = skb->len; break; case NFT_META_PROTOCOL: + *dest = 0; *(__be16 *)dest = skb->protocol; break; case NFT_META_NFPROTO: @@ -75,11 +76,13 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_IIFTYPE: if (in == NULL) goto err; + *dest = 0; *(u16 *)dest = in->type; break; case NFT_META_OIFTYPE: if (out == NULL) goto err; + *dest = 0; *(u16 *)dest = out->type; break; case NFT_META_SKUID: @@ -185,7 +188,7 @@ void nft_meta_set_eval(const struct nft_expr *expr, { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; - u32 value = regs->data[meta->sreg].data[0]; + u32 value = regs->data[meta->sreg]; switch (meta->key) { case NFT_META_MARK: diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 065cbda63b0a..ee2d71753746 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -49,26 +49,26 @@ static void nft_nat_eval(const struct nft_expr *expr, if (priv->sreg_addr_min) { if (priv->family == AF_INET) { range.min_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_min].data[0]; + regs->data[priv->sreg_addr_min]; range.max_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_max].data[0]; + regs->data[priv->sreg_addr_max]; } else { memcpy(range.min_addr.ip6, - ®s->data[priv->sreg_addr_min].data, - sizeof(struct nft_data)); + ®s->data[priv->sreg_addr_min], + sizeof(range.min_addr.ip6)); memcpy(range.max_addr.ip6, - ®s->data[priv->sreg_addr_max].data, - sizeof(struct nft_data)); + ®s->data[priv->sreg_addr_max], + sizeof(range.max_addr.ip6)); } range.flags |= NF_NAT_RANGE_MAP_IPS; } if (priv->sreg_proto_min) { range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_min]; range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max].data[0]; + *(__be16 *)®s->data[priv->sreg_proto_max]; range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 5fa997346a23..94fb3b27a2c5 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -23,7 +23,7 @@ static void nft_payload_eval(const struct nft_expr *expr, { const struct nft_payload *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; - u32 *dest = ®s->data[priv->dreg].data[0]; + u32 *dest = ®s->data[priv->dreg]; int offset; switch (priv->base) { @@ -43,6 +43,7 @@ static void nft_payload_eval(const struct nft_expr *expr, } offset += priv->offset; + dest[priv->len / NFT_REG32_SIZE] = 0; if (skb_copy_bits(skb, offset, dest, priv->len) < 0) goto err; return; -- cgit v1.2.3 From 7d7402642eaf385aef0772eff5a35e34fc4995d7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 11 Apr 2015 02:27:39 +0100 Subject: netfilter: nf_tables: variable sized set element keys / data This patch changes sets to support variable sized set element keys / data up to 64 bytes each by using variable sized set extensions. This allows to use concatenations with bigger data items suchs as IPv6 addresses. As a side effect, small keys/data now don't require the full 16 bytes of struct nft_data anymore but just the space they need. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 ++++- include/uapi/linux/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 27 ++++++++++++--------------- net/netfilter/nft_hash.c | 4 ++-- net/netfilter/nft_rbtree.c | 3 ++- 5 files changed, 23 insertions(+), 19 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 160577bf0f0a..cb42da1011ef 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -158,7 +158,10 @@ struct nft_userdata { * @priv: element private data and extensions */ struct nft_set_elem { - struct nft_data key; + union { + u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; + struct nft_data val; + } key; void *priv; }; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 4221a6c3a8a5..be8584c95297 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -388,6 +388,9 @@ enum nft_data_attributes { }; #define NFTA_DATA_MAX (__NFTA_DATA_MAX - 1) +/* Maximum length of a value */ +#define NFT_DATA_VALUE_MAXLEN 64 + /** * enum nft_verdict_attributes - nf_tables verdict netlink attributes * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2b3f88f4c70f..ed0e70ea2bc5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2608,7 +2608,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, } desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); - if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data)) + if (desc.klen == 0 || desc.klen > NFT_DATA_VALUE_MAXLEN) return -EINVAL; flags = 0; @@ -2634,11 +2634,10 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); - if (desc.dlen == 0 || - desc.dlen > FIELD_SIZEOF(struct nft_data, data)) + if (desc.dlen == 0 || desc.dlen > NFT_DATA_VALUE_MAXLEN) return -EINVAL; } else - desc.dlen = sizeof(struct nft_data); + desc.dlen = sizeof(struct nft_verdict); } else if (flags & NFT_SET_MAP) return -EINVAL; @@ -2854,12 +2853,10 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_KEY] = { - .len = sizeof(struct nft_data), - .align = __alignof__(struct nft_data), + .align = __alignof__(u32), }, [NFT_SET_EXT_DATA] = { - .len = sizeof(struct nft_data), - .align = __alignof__(struct nft_data), + .align = __alignof__(u32), }, [NFT_SET_EXT_FLAGS] = { .len = sizeof(u8), @@ -3299,7 +3296,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, timeout = set->timeout; } - err = nft_data_init(ctx, &elem.key, sizeof(elem.key), &d1, + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err1; @@ -3307,7 +3304,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) goto err2; - nft_set_ext_add(&tmpl, NFT_SET_EXT_KEY); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len); if (timeout > 0) { nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); if (timeout != set->timeout) @@ -3342,7 +3339,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err3; } - nft_set_ext_add(&tmpl, NFT_SET_EXT_DATA); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len); } /* The full maximum length of userdata can exceed the maximum @@ -3358,7 +3355,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, elem.key.data, data.data, + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, timeout, GFP_KERNEL); if (elem.priv == NULL) goto err3; @@ -3393,7 +3390,7 @@ err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_uninit(&data, d2.type); err2: - nft_data_uninit(&elem.key, d1.type); + nft_data_uninit(&elem.key.val, d1.type); err1: return err; } @@ -3460,7 +3457,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_KEY] == NULL) goto err1; - err = nft_data_init(ctx, &elem.key, sizeof(elem.key), &desc, + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err1; @@ -3488,7 +3485,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, err3: kfree(trans); err2: - nft_data_uninit(&elem.key, desc.type); + nft_data_uninit(&elem.key.val, desc.type); err1: return err; } diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 767df41d28ea..3f9d45d3d9b7 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -133,7 +133,7 @@ static int nft_hash_insert(const struct nft_set *set, struct nft_hash_cmp_arg arg = { .genmask = nft_genmask_next(read_pnet(&set->pnet)), .set = set, - .key = elem->key.data, + .key = elem->key.val.data, }; return rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, @@ -157,7 +157,7 @@ static void *nft_hash_deactivate(const struct nft_set *set, struct nft_hash_cmp_arg arg = { .genmask = nft_genmask_next(read_pnet(&set->pnet)), .set = set, - .key = elem->key.data, + .key = elem->key.val.data, }; rcu_read_lock(); diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index b888e0cdf1e2..1c30f41cff5b 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -152,7 +152,8 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key, set->klen); + d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val, + set->klen); if (d < 0) parent = parent->rb_left; else if (d > 0) -- cgit v1.2.3 From 24477e57412a7a7dea62637ac990bc5c1cff0665 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 8 Apr 2015 19:41:40 +0200 Subject: uapi: ebtables: don't include linux/if.h linux/if.h creates conflicts in userspace with net/if.h By using it here we force userspace to use linux/if.h while net/if.h may be needed. Note that: include/linux/netfilter_ipv4/ip_tables.h and include/linux/netfilter_ipv6/ip6_tables.h don't include linux/if.h and they also refer to IFNAMSIZ, so they are expecting userspace to include use net/if.h from the client program. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 3 ++- include/uapi/linux/netfilter_bridge/ebtables.h | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 34e7a2b7f867..f1bd3962e6b6 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -12,9 +12,10 @@ #ifndef __LINUX_BRIDGE_EFF_H #define __LINUX_BRIDGE_EFF_H +#include +#include #include - /* return values for match() functions */ #define EBT_MATCH 0 #define EBT_NOMATCH 1 diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index ba993360dbe9..773dfe8924c7 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -12,9 +12,7 @@ #ifndef _UAPI__LINUX_BRIDGE_EFF_H #define _UAPI__LINUX_BRIDGE_EFF_H -#include #include -#include #define EBT_TABLE_MAXNAMELEN 32 #define EBT_CHAIN_MAXNAMELEN EBT_TABLE_MAXNAMELEN -- cgit v1.2.3 From f25ad2e907f110378159fe5e088aa13176faaa5b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 11 Apr 2015 10:46:39 +0100 Subject: netfilter: nf_tables: prepare for expressions associated to set elements Preparation to attach expressions to set elements: add a set extension type to hold an expression and dump the expression information with the set element. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +++++++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 9 +++++++++ 3 files changed, 18 insertions(+) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e21623cb7b20..d45a871b3da6 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -371,6 +371,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element + * @NFT_SET_EXT_EXPR: expression assiociated with the element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { @@ -380,6 +381,7 @@ enum nft_set_extensions { NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, + NFT_SET_EXT_EXPR, NFT_SET_EXT_NUM }; @@ -491,6 +493,11 @@ static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext return nft_set_ext(ext, NFT_SET_EXT_USERDATA); } +static inline struct nft_expr *nft_set_ext_expr(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_EXPR); +} + static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index be8584c95297..f9c5af22a6af 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -322,6 +322,7 @@ enum nft_set_elem_flags { * @NFTA_SET_ELEM_TIMEOUT: timeout value (NLA_U64) * @NFTA_SET_ELEM_EXPIRATION: expiration time (NLA_U64) * @NFTA_SET_ELEM_USERDATA: user data (NLA_BINARY) + * @NFTA_SET_ELEM_EXPR: expression (NLA_NESTED: nft_expr_attributes) */ enum nft_set_elem_attributes { NFTA_SET_ELEM_UNSPEC, @@ -331,6 +332,7 @@ enum nft_set_elem_attributes { NFTA_SET_ELEM_TIMEOUT, NFTA_SET_ELEM_EXPIRATION, NFTA_SET_ELEM_USERDATA, + NFTA_SET_ELEM_EXPR, __NFTA_SET_ELEM_MAX }; #define NFTA_SET_ELEM_MAX (__NFTA_SET_ELEM_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e97bee59fe08..8830811550ec 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2904,6 +2904,9 @@ const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_DATA] = { .align = __alignof__(u32), }, + [NFT_SET_EXT_EXPR] = { + .align = __alignof__(struct nft_expr), + }, [NFT_SET_EXT_FLAGS] = { .len = sizeof(u8), .align = __alignof__(u8), @@ -2990,6 +2993,10 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, set->dlen) < 0) goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR) && + nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0) + goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(*nft_set_ext_flags(ext)))) @@ -3276,6 +3283,8 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem) nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_uninit(nft_set_ext_data(ext), set->dtype); + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); kfree(elem); } -- cgit v1.2.3 From 7c6c6e95a12e46f499749bdd496e53d03950f377 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 11 Apr 2015 10:46:41 +0100 Subject: netfilter: nf_tables: add flag to indicate set contains expressions Add a set flag to indicate that the set is used as a state table and contains expressions for evaluation. This operation is mutually exclusive with the mapping operation, so sets specifying both are rejected. The lookup expression also rejects binding to state tables since it only deals with loopup and map operations. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 8 ++++++-- net/netfilter/nft_lookup.c | 3 +++ 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index f9c5af22a6af..48942381d02f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -238,6 +238,7 @@ enum nft_rule_compat_attributes { * @NFT_SET_INTERVAL: set contains intervals * @NFT_SET_MAP: set is used as a dictionary * @NFT_SET_TIMEOUT: set uses timeouts + * @NFT_SET_EVAL: set contains expressions for evaluation */ enum nft_set_flags { NFT_SET_ANONYMOUS = 0x1, @@ -245,6 +246,7 @@ enum nft_set_flags { NFT_SET_INTERVAL = 0x4, NFT_SET_MAP = 0x8, NFT_SET_TIMEOUT = 0x10, + NFT_SET_EVAL = 0x20, }; /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8830811550ec..78af83bc9c8e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2661,9 +2661,13 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_FLAGS] != NULL) { flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | - NFT_SET_INTERVAL | NFT_SET_MAP | - NFT_SET_TIMEOUT)) + NFT_SET_INTERVAL | NFT_SET_TIMEOUT | + NFT_SET_MAP | NFT_SET_EVAL)) return -EINVAL; + /* Only one of both operations is supported */ + if ((flags & (NFT_SET_MAP | NFT_SET_EVAL)) == + (NFT_SET_MAP | NFT_SET_EVAL)) + return -EOPNOTSUPP; } dtype = 0; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index ba1466209f2a..b3c31ef8015d 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -71,6 +71,9 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return PTR_ERR(set); } + if (set->flags & NFT_SET_EVAL) + return -EOPNOTSUPP; + priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); err = nft_validate_register_load(priv->sreg, set->klen); if (err < 0) -- cgit v1.2.3 From 3e135cd499bfbec15684fe9c756162d58df4dc77 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 11 Apr 2015 10:46:42 +0100 Subject: netfilter: nft_dynset: dynamic stateful expression instantiation Support instantiating stateful expressions based on a template that are associated with dynamically created set entries. The expressions are evaluated when adding or updating the set element. This allows to maintain per flow state using the existing set infrastructure and expression types, with arbitrary definitions of a flow. Usage is currently restricted to anonymous sets, meaning only a single binding can exist, since the desired semantics of multiple independant bindings haven't been defined so far. Examples (userspace syntax is still WIP): 1. Limit the rate of new SSH connections per host, similar to iptables hashlimit: flow ip saddr timeout 60s \ limit 10/second \ accept 2. Account network traffic between each set of /24 networks: flow ip saddr & 255.255.255.0 . ip daddr & 255.255.255.0 \ counter 3. Account traffic to each host per user: flow skuid . ip daddr \ counter 4. Account traffic for each combination of source address and TCP flags: flow ip saddr . tcp flags \ counter The resulting set content after a Xmas-scan look like this: { 192.168.122.1 . fin | psh | urg : counter packets 1001 bytes 40040, 192.168.122.1 . ack : counter packets 74 bytes 3848, 192.168.122.1 . psh | ack : counter packets 35 bytes 3144 } Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_dynset.c | 54 +++++++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 48942381d02f..5fa1cd04762e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -567,6 +567,7 @@ enum nft_dynset_ops { * @NFTA_DYNSET_SREG_KEY: source register of the key (NLA_U32) * @NFTA_DYNSET_SREG_DATA: source register of the data (NLA_U32) * @NFTA_DYNSET_TIMEOUT: timeout value for the new element (NLA_U64) + * @NFTA_DYNSET_EXPR: expression (NLA_NESTED: nft_expr_attributes) */ enum nft_dynset_attributes { NFTA_DYNSET_UNSPEC, @@ -576,6 +577,7 @@ enum nft_dynset_attributes { NFTA_DYNSET_SREG_KEY, NFTA_DYNSET_SREG_DATA, NFTA_DYNSET_TIMEOUT, + NFTA_DYNSET_EXPR, __NFTA_DYNSET_MAX, }; #define NFTA_DYNSET_MAX (__NFTA_DYNSET_MAX - 1) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 03699d5c0b4b..513a8ef60a59 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -23,6 +23,7 @@ struct nft_dynset { enum nft_registers sreg_key:8; enum nft_registers sreg_data:8; u64 timeout; + struct nft_expr *expr; struct nft_set_binding binding; }; @@ -30,6 +31,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, struct nft_regs *regs) { const struct nft_dynset *priv = nft_expr_priv(expr); + struct nft_set_ext *ext; u64 timeout; void *elem; @@ -44,7 +46,13 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, if (elem == NULL) { if (set->size) atomic_dec(&set->nelems); + return NULL; } + + ext = nft_set_elem_ext(set, elem); + if (priv->expr != NULL) + nft_expr_clone(nft_set_ext_expr(ext), priv->expr); + return elem; } @@ -55,18 +63,27 @@ static void nft_dynset_eval(const struct nft_expr *expr, const struct nft_dynset *priv = nft_expr_priv(expr); struct nft_set *set = priv->set; const struct nft_set_ext *ext; + const struct nft_expr *sexpr; u64 timeout; if (set->ops->update(set, ®s->data[priv->sreg_key], nft_dynset_new, expr, regs, &ext)) { + sexpr = NULL; + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + sexpr = nft_set_ext_expr(ext); + if (priv->op == NFT_DYNSET_OP_UPDATE && nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { timeout = priv->timeout ? : set->timeout; *nft_set_ext_expiration(ext) = jiffies + timeout; - return; - } - } + } else if (sexpr == NULL) + goto out; + if (sexpr != NULL) + sexpr->ops->eval(sexpr, regs, pkt); + return; + } +out: regs->verdict.code = NFT_BREAK; } @@ -77,6 +94,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED }, }; static int nft_dynset_init(const struct nft_ctx *ctx, @@ -142,10 +160,29 @@ static int nft_dynset_init(const struct nft_ctx *ctx, } else if (set->flags & NFT_SET_MAP) return -EINVAL; + if (tb[NFTA_DYNSET_EXPR] != NULL) { + if (!(set->flags & NFT_SET_EVAL)) + return -EINVAL; + if (!(set->flags & NFT_SET_ANONYMOUS)) + return -EOPNOTSUPP; + + priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]); + if (IS_ERR(priv->expr)) + return PTR_ERR(priv->expr); + + err = -EOPNOTSUPP; + if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL)) + goto err1; + } else if (set->flags & NFT_SET_EVAL) + return -EINVAL; + nft_set_ext_prepare(&priv->tmpl); nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen); if (set->flags & NFT_SET_MAP) nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen); + if (priv->expr != NULL) + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR, + priv->expr->ops->size); if (set->flags & NFT_SET_TIMEOUT) { if (timeout || set->timeout) nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION); @@ -155,10 +192,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx, err = nf_tables_bind_set(ctx, set, &priv->binding); if (err < 0) - return err; + goto err1; priv->set = set; return 0; + +err1: + if (priv->expr != NULL) + nft_expr_destroy(ctx, priv->expr); + return err; } static void nft_dynset_destroy(const struct nft_ctx *ctx, @@ -167,6 +209,8 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx, struct nft_dynset *priv = nft_expr_priv(expr); nf_tables_unbind_set(ctx, priv->set, &priv->binding); + if (priv->expr != NULL) + nft_expr_destroy(ctx, priv->expr); } static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -184,6 +228,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout))) goto nla_put_failure; + if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) + goto nla_put_failure; return 0; nla_put_failure: -- cgit v1.2.3 From 996f545fbb0dc9ed4a640b5ef098f51fe28cca5c Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Thu, 26 Feb 2015 12:44:51 +0900 Subject: drm/nouveau/gem: allow user-space to specify an object should be coherent User-space use mappable BOs notably for fences, and expects that a value update by the GPU will be immediatly visible through the user-space mapping. ARM has a property that may prevent this from happening though: memory can be mapped multiple times only if the different mappings share the same caching properties. However all the lowmem memory is already identity-mapped into the kernel with cache enabled, so when user-space requests an uncached mapping, we actually get an "undefined caching policy" one and this has strange side-effects described on Freedesktop bug 86690. To prevent this from happening, allow user-space to explicitly specify which objects should be coherent, and create such objects with the TTM_PL_FLAG_UNCACHED flag. This will make TTM allocate memory using the DMA API, which will fix the identify mapping and allow us to safely map the objects to user-space uncached. Signed-off-by: Alexandre Courbot Reviewed-by: Lucas Stach Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_gem.c | 3 +++ include/uapi/drm/nouveau_drm.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 7c077fced1d1..0e690bf19fc9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -189,6 +189,9 @@ nouveau_gem_new(struct drm_device *dev, int size, int align, uint32_t domain, if (!flags || domain & NOUVEAU_GEM_DOMAIN_CPU) flags |= TTM_PL_FLAG_SYSTEM; + if (domain & NOUVEAU_GEM_DOMAIN_COHERENT) + flags |= TTM_PL_FLAG_UNCACHED; + ret = nouveau_bo_new(dev, size, align, flags, tile_mode, tile_flags, NULL, NULL, pnvbo); if (ret) diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h index 0d7608dc1a34..5507eead5863 100644 --- a/include/uapi/drm/nouveau_drm.h +++ b/include/uapi/drm/nouveau_drm.h @@ -39,6 +39,7 @@ #define NOUVEAU_GEM_DOMAIN_VRAM (1 << 1) #define NOUVEAU_GEM_DOMAIN_GART (1 << 2) #define NOUVEAU_GEM_DOMAIN_MAPPABLE (1 << 3) +#define NOUVEAU_GEM_DOMAIN_COHERENT (1 << 4) #define NOUVEAU_GEM_TILE_COMP 0x00030000 /* nv50-only */ #define NOUVEAU_GEM_TILE_LAYOUT_MASK 0x0000ff00 -- cgit v1.2.3 From bfebd1cdb497a57757c83f5fbf1a29931591e2a4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sun, 8 Mar 2015 00:51:47 -0500 Subject: dm: add full blk-mq support to request-based DM Commit e5863d9ad ("dm: allocate requests in target when stacking on blk-mq devices") served as the first step toward fully utilizing blk-mq in request-based DM -- it enabled stacking an old-style (request_fn) request_queue ontop of the underlying blk-mq device(s). That first step didn't improve performance of DM multipath ontop of fast blk-mq devices (e.g. NVMe) because the top-level old-style request_queue was severely limited by the queue_lock. The second step offered here enables stacking a blk-mq request_queue ontop of the underlying blk-mq device(s). This unlocks significant performance gains on fast blk-mq devices, Keith Busch tested on his NVMe testbed and offered this really positive news: "Just providing a performance update. All my fio tests are getting roughly equal performance whether accessed through the raw block device or the multipath device mapper (~470k IOPS). I could only push ~20% of the raw iops through dm before this conversion, so this latest tree is looking really solid from a performance standpoint." Signed-off-by: Mike Snitzer Tested-by: Keith Busch --- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-table.c | 11 +- drivers/md/dm.c | 317 +++++++++++++++++++++++++++++++++--------- include/uapi/linux/dm-ioctl.h | 4 +- 4 files changed, 261 insertions(+), 73 deletions(-) (limited to 'include/uapi') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index add6391f3f8e..c8f07e5a9a17 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1703,7 +1703,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 8, 0}, + .version = {1, 9, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 057312048b68..66600cab9fa5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -18,6 +18,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "table" @@ -1695,9 +1696,13 @@ void dm_table_run_md_queue_async(struct dm_table *t) md = dm_table_get_md(t); queue = dm_get_md_queue(md); if (queue) { - spin_lock_irqsave(queue->queue_lock, flags); - blk_run_queue_async(queue); - spin_unlock_irqrestore(queue->queue_lock, flags); + if (queue->mq_ops) + blk_mq_run_hw_queues(queue, true); + else { + spin_lock_irqsave(queue->queue_lock, flags); + blk_run_queue_async(queue); + spin_unlock_irqrestore(queue->queue_lock, flags); + } } } EXPORT_SYMBOL(dm_table_run_md_queue_async); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5294e016e92b..3a66baac76ed 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -23,6 +23,7 @@ #include #include #include /* for rq_end_sector() */ +#include #include @@ -224,6 +225,9 @@ struct mapped_device { int last_rq_rw; sector_t last_rq_pos; ktime_t last_rq_start_time; + + /* for blk-mq request-based DM support */ + struct blk_mq_tag_set tag_set; }; /* @@ -1025,6 +1029,11 @@ static void end_clone_bio(struct bio *clone, int error) blk_update_request(tio->orig, 0, nr_bytes); } +static struct dm_rq_target_io *tio_from_request(struct request *rq) +{ + return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special); +} + /* * Don't touch any member of the md after calling this function because * the md may be freed in dm_put() at the end of this function. @@ -1048,8 +1057,10 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) * queue lock again. */ if (run_queue) { - if (!nr_requests_pending || - (nr_requests_pending >= md->queue->nr_congestion_on)) + if (md->queue->mq_ops) + blk_mq_run_hw_queues(md->queue, true); + else if (!nr_requests_pending || + (nr_requests_pending >= md->queue->nr_congestion_on)) blk_run_queue_async(md->queue); } @@ -1062,13 +1073,17 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) static void free_rq_clone(struct request *clone) { struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; blk_rq_unprep_clone(clone); + if (clone->q && clone->q->mq_ops) tio->ti->type->release_clone_rq(clone); else - free_clone_request(tio->md, clone); - free_rq_tio(tio); + free_clone_request(md, clone); + + if (!md->queue->mq_ops) + free_rq_tio(tio); } /* @@ -1097,17 +1112,22 @@ static void dm_end_request(struct request *clone, int error) } free_rq_clone(clone); - blk_end_request_all(rq, error); + if (!rq->q->mq_ops) + blk_end_request_all(rq, error); + else + blk_mq_end_request(rq, error); rq_completed(md, rw, true); } static void dm_unprep_request(struct request *rq) { - struct dm_rq_target_io *tio = rq->special; + struct dm_rq_target_io *tio = tio_from_request(rq); struct request *clone = tio->clone; - rq->special = NULL; - rq->cmd_flags &= ~REQ_DONTPREP; + if (!rq->q->mq_ops) { + rq->special = NULL; + rq->cmd_flags &= ~REQ_DONTPREP; + } if (clone) free_rq_clone(clone); @@ -1116,18 +1136,29 @@ static void dm_unprep_request(struct request *rq) /* * Requeue the original request of a clone. */ -static void dm_requeue_unmapped_original_request(struct mapped_device *md, - struct request *rq) +static void old_requeue_request(struct request *rq) { - int rw = rq_data_dir(rq); struct request_queue *q = rq->q; unsigned long flags; - dm_unprep_request(rq); - spin_lock_irqsave(q->queue_lock, flags); blk_requeue_request(q, rq); spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void dm_requeue_unmapped_original_request(struct mapped_device *md, + struct request *rq) +{ + int rw = rq_data_dir(rq); + + dm_unprep_request(rq); + + if (!rq->q->mq_ops) + old_requeue_request(rq); + else { + blk_mq_requeue_request(rq); + blk_mq_kick_requeue_list(rq->q); + } rq_completed(md, rw, false); } @@ -1139,35 +1170,44 @@ static void dm_requeue_unmapped_request(struct request *clone) dm_requeue_unmapped_original_request(tio->md, tio->orig); } -static void __stop_queue(struct request_queue *q) -{ - blk_stop_queue(q); -} - -static void stop_queue(struct request_queue *q) +static void old_stop_queue(struct request_queue *q) { unsigned long flags; + if (blk_queue_stopped(q)) + return; + spin_lock_irqsave(q->queue_lock, flags); - __stop_queue(q); + blk_stop_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } -static void __start_queue(struct request_queue *q) +static void stop_queue(struct request_queue *q) { - if (blk_queue_stopped(q)) - blk_start_queue(q); + if (!q->mq_ops) + old_stop_queue(q); + else + blk_mq_stop_hw_queues(q); } -static void start_queue(struct request_queue *q) +static void old_start_queue(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - __start_queue(q); + if (blk_queue_stopped(q)) + blk_start_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } +static void start_queue(struct request_queue *q) +{ + if (!q->mq_ops) + old_start_queue(q); + else + blk_mq_start_stopped_hw_queues(q, true); +} + static void dm_done(struct request *clone, int error, bool mapped) { int r = error; @@ -1206,13 +1246,20 @@ static void dm_done(struct request *clone, int error, bool mapped) static void dm_softirq_done(struct request *rq) { bool mapped = true; - struct dm_rq_target_io *tio = rq->special; + struct dm_rq_target_io *tio = tio_from_request(rq); struct request *clone = tio->clone; + int rw; if (!clone) { - blk_end_request_all(rq, tio->error); - rq_completed(tio->md, rq_data_dir(rq), false); - free_rq_tio(tio); + rw = rq_data_dir(rq); + if (!rq->q->mq_ops) { + blk_end_request_all(rq, tio->error); + rq_completed(tio->md, rw, false); + free_rq_tio(tio); + } else { + blk_mq_end_request(rq, tio->error); + rq_completed(tio->md, rw, false); + } return; } @@ -1228,7 +1275,7 @@ static void dm_softirq_done(struct request *rq) */ static void dm_complete_request(struct request *rq, int error) { - struct dm_rq_target_io *tio = rq->special; + struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; blk_complete_request(rq); @@ -1247,7 +1294,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error) } /* - * Called with the clone's queue lock held + * Called with the clone's queue lock held (for non-blk-mq) */ static void end_clone_request(struct request *clone, int error) { @@ -1808,6 +1855,18 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md, static void map_tio_request(struct kthread_work *work); +static void init_tio(struct dm_rq_target_io *tio, struct request *rq, + struct mapped_device *md) +{ + tio->md = md; + tio->ti = NULL; + tio->clone = NULL; + tio->orig = rq; + tio->error = 0; + memset(&tio->info, 0, sizeof(tio->info)); + init_kthread_work(&tio->work, map_tio_request); +} + static struct dm_rq_target_io *prep_tio(struct request *rq, struct mapped_device *md, gfp_t gfp_mask) { @@ -1819,13 +1878,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq, if (!tio) return NULL; - tio->md = md; - tio->ti = NULL; - tio->clone = NULL; - tio->orig = rq; - tio->error = 0; - memset(&tio->info, 0, sizeof(tio->info)); - init_kthread_work(&tio->work, map_tio_request); + init_tio(tio, rq, md); table = dm_get_live_table(md, &srcu_idx); if (!dm_table_mq_request_based(table)) { @@ -1869,11 +1922,11 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) * DM_MAPIO_REQUEUE : the original request needs to be requeued * < 0 : the request was completed due to failure */ -static int map_request(struct dm_target *ti, struct request *rq, +static int map_request(struct dm_rq_target_io *tio, struct request *rq, struct mapped_device *md) { int r; - struct dm_rq_target_io *tio = rq->special; + struct dm_target *ti = tio->ti; struct request *clone = NULL; if (tio->clone) { @@ -1888,7 +1941,7 @@ static int map_request(struct dm_target *ti, struct request *rq, } if (IS_ERR(clone)) return DM_MAPIO_REQUEUE; - if (setup_clone(clone, rq, tio, GFP_KERNEL)) { + if (setup_clone(clone, rq, tio, GFP_NOIO)) { /* -ENOMEM */ ti->type->release_clone_rq(clone); return DM_MAPIO_REQUEUE; @@ -1929,13 +1982,16 @@ static void map_tio_request(struct kthread_work *work) struct request *rq = tio->orig; struct mapped_device *md = tio->md; - if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE) + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) dm_requeue_unmapped_original_request(md, rq); } static void dm_start_request(struct mapped_device *md, struct request *orig) { - blk_start_request(orig); + if (!orig->q->mq_ops) + blk_start_request(orig); + else + blk_mq_start_request(orig); atomic_inc(&md->pending[rq_data_dir(orig)]); if (md->seq_rq_merge_deadline_usecs) { @@ -2045,7 +2101,7 @@ static void dm_request_fn(struct request_queue *q) dm_start_request(md, rq); - tio = rq->special; + tio = tio_from_request(rq); /* Establish tio->ti before queuing work (map_tio_request) */ tio->ti = ti; queue_kthread_work(&md->kworker, &tio->work); @@ -2142,7 +2198,7 @@ static void dm_init_md_queue(struct mapped_device *md) { /* * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device has not been decided yet. + * devices. The type of this dm device may not have been decided yet. * The type is decided at the first table loading time. * To prevent problematic device stacking, clear the queue flag * for request stacking support until then. @@ -2150,7 +2206,15 @@ static void dm_init_md_queue(struct mapped_device *md) * This queue is new, so no concurrency on the queue_flags. */ queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); +} + +static void dm_init_old_md_queue(struct mapped_device *md) +{ + dm_init_md_queue(md); + /* + * Initialize aspects of queue that aren't relevant for blk-mq + */ md->queue->queuedata = md; md->queue->backing_dev_info.congested_fn = dm_any_congested; md->queue->backing_dev_info.congested_data = md; @@ -2273,6 +2337,7 @@ static void unlock_fs(struct mapped_device *md); static void free_dev(struct mapped_device *md) { int minor = MINOR(disk_devt(md->disk)); + bool using_blk_mq = !!md->queue->mq_ops; unlock_fs(md); destroy_workqueue(md->wq); @@ -2298,6 +2363,8 @@ static void free_dev(struct mapped_device *md) del_gendisk(md->disk); put_disk(md->disk); blk_cleanup_queue(md->queue); + if (using_blk_mq) + blk_mq_free_tag_set(&md->tag_set); bdput(md->bdev); free_minor(minor); @@ -2457,7 +2524,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * This must be done before setting the queue restrictions, * because request-based dm may be run just after the setting. */ - if (dm_table_request_based(t) && !blk_queue_stopped(q)) + if (dm_table_request_based(t)) stop_queue(q); __bind_mempools(md, t); @@ -2539,14 +2606,6 @@ unsigned dm_get_md_type(struct mapped_device *md) return md->type; } -static bool dm_md_type_request_based(struct mapped_device *md) -{ - unsigned table_type = dm_get_md_type(md); - - return (table_type == DM_TYPE_REQUEST_BASED || - table_type == DM_TYPE_MQ_REQUEST_BASED); -} - struct target_type *dm_get_immutable_target_type(struct mapped_device *md) { return md->immutable_target_type; @@ -2563,6 +2622,14 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_get_queue_limits); +static void init_rq_based_worker_thread(struct mapped_device *md) +{ + /* Initialize the request-based DM worker thread */ + init_kthread_worker(&md->kworker); + md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, + "kdmwork-%s", dm_device_name(md)); +} + /* * Fully initialize a request-based queue (->elevator, ->request_fn, etc). */ @@ -2571,29 +2638,131 @@ static int dm_init_request_based_queue(struct mapped_device *md) struct request_queue *q = NULL; if (md->queue->elevator) - return 1; + return 0; /* Fully initialize the queue */ q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); if (!q) - return 0; + return -EINVAL; /* disable dm_request_fn's merge heuristic by default */ md->seq_rq_merge_deadline_usecs = 0; md->queue = q; - dm_init_md_queue(md); + dm_init_old_md_queue(md); blk_queue_softirq_done(md->queue, dm_softirq_done); blk_queue_prep_rq(md->queue, dm_prep_fn); - /* Also initialize the request-based DM worker thread */ - init_kthread_worker(&md->kworker); - md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, - "kdmwork-%s", dm_device_name(md)); + init_rq_based_worker_thread(md); elv_register_queue(md->queue); - return 1; + return 0; +} + +static int dm_mq_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct mapped_device *md = data; + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); + + /* + * Must initialize md member of tio, otherwise it won't + * be available in dm_mq_queue_rq. + */ + tio->md = md; + + return 0; +} + +static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *rq = bd->rq; + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); + struct mapped_device *md = tio->md; + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); + struct dm_target *ti; + sector_t pos; + + /* always use block 0 to find the target for flushes for now */ + pos = 0; + if (!(rq->cmd_flags & REQ_FLUSH)) + pos = blk_rq_pos(rq); + + ti = dm_table_find_target(map, pos); + if (!dm_target_is_valid(ti)) { + dm_put_live_table(md, srcu_idx); + DMERR_LIMIT("request attempted access beyond the end of device"); + /* + * Must perform setup, that rq_completed() requires, + * before returning BLK_MQ_RQ_QUEUE_ERROR + */ + dm_start_request(md, rq); + return BLK_MQ_RQ_QUEUE_ERROR; + } + dm_put_live_table(md, srcu_idx); + + if (ti->type->busy && ti->type->busy(ti)) + return BLK_MQ_RQ_QUEUE_BUSY; + + dm_start_request(md, rq); + + /* Init tio using md established in .init_request */ + init_tio(tio, rq, md); + + /* Establish tio->ti before queuing work (map_tio_request) */ + tio->ti = ti; + queue_kthread_work(&md->kworker, &tio->work); + + return BLK_MQ_RQ_QUEUE_OK; +} + +static struct blk_mq_ops dm_mq_ops = { + .queue_rq = dm_mq_queue_rq, + .map_queue = blk_mq_map_queue, + .complete = dm_softirq_done, + .init_request = dm_mq_init_request, +}; + +static int dm_init_request_based_blk_mq_queue(struct mapped_device *md) +{ + struct request_queue *q; + int err; + + memset(&md->tag_set, 0, sizeof(md->tag_set)); + md->tag_set.ops = &dm_mq_ops; + md->tag_set.queue_depth = BLKDEV_MAX_RQ; + md->tag_set.numa_node = NUMA_NO_NODE; + md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + md->tag_set.nr_hw_queues = 1; + md->tag_set.cmd_size = sizeof(struct dm_rq_target_io); + md->tag_set.driver_data = md; + + err = blk_mq_alloc_tag_set(&md->tag_set); + if (err) + return err; + + q = blk_mq_init_allocated_queue(&md->tag_set, md->queue); + if (IS_ERR(q)) { + err = PTR_ERR(q); + goto out_tag_set; + } + md->queue = q; + dm_init_md_queue(md); + + /* backfill 'mq' sysfs registration normally done in blk_register_queue */ + blk_mq_register_disk(md->disk); + + init_rq_based_worker_thread(md); + + return 0; + +out_tag_set: + blk_mq_free_tag_set(&md->tag_set); + return err; } /* @@ -2601,15 +2770,29 @@ static int dm_init_request_based_queue(struct mapped_device *md) */ int dm_setup_md_queue(struct mapped_device *md) { - if (dm_md_type_request_based(md)) { - if (!dm_init_request_based_queue(md)) { + int r; + unsigned md_type = dm_get_md_type(md); + + switch (md_type) { + case DM_TYPE_REQUEST_BASED: + r = dm_init_request_based_queue(md); + if (r) { DMWARN("Cannot initialize queue for request-based mapped device"); - return -EINVAL; + return r; } - } else { - /* bio-based specific initialization */ + break; + case DM_TYPE_MQ_REQUEST_BASED: + r = dm_init_request_based_blk_mq_queue(md); + if (r) { + DMWARN("Cannot initialize queue for request-based blk-mq mapped device"); + return r; + } + break; + case DM_TYPE_BIO_BASED: + dm_init_old_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); blk_queue_merge_bvec(md->queue, dm_merge_bvec); + break; } return 0; diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 889f3a5b7b18..eac8c3641f39 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 30 +#define DM_VERSION_MINOR 31 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2014-12-22)" +#define DM_VERSION_EXTRA "-ioctl (2015-3-12)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From a166151cbe33b53221c24259e4a7201064b3ba79 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 15 Apr 2015 12:55:45 -0700 Subject: bpf: fix bpf helpers to use skb->mac_header relative offsets For the short-term solution, lets fix bpf helper functions to use skb->mac_header relative offsets instead of skb->data in order to get the same eBPF programs with cls_bpf and act_bpf work on ingress and egress qdisc path. We need to ensure that mac_header is set before calling into programs. This is effectively the first option from below referenced discussion. More long term solution for LD_ABS|LD_IND instructions will be more intrusive but also more beneficial than this, and implemented later as it's too risky at this point in time. I.e., we plan to look into the option of moving skb_pull() out of eth_type_trans() and into netif_receive_skb() as has been suggested as second option. Meanwhile, this solution ensures ingress can be used with eBPF, too, and that we won't run into ABI troubles later. For dealing with negative offsets inside eBPF helper functions, we've implemented bpf_skb_clone_unwritable() to test for unwriteable headers. Reference: http://thread.gmane.org/gmane.linux.network/359129/focus=359694 Fixes: 608cd71a9c7c ("tc: bpf: generalize pedit action") Fixes: 91bc4822c3d6 ("tc: bpf: add checksum helpers") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 +- include/uapi/linux/filter.h | 7 +++++-- net/core/filter.c | 41 ++++++++++++++++++++++++++++++++--------- net/sched/act_bpf.c | 3 +++ net/sched/cls_bpf.c | 3 +++ samples/bpf/tcbpf1_kern.c | 16 ++++++---------- 6 files changed, 50 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5c1cee11f777..a9ebdf5701e8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -177,7 +177,7 @@ enum bpf_func_id { /** * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet * @skb: pointer to skb - * @offset: offset within packet from skb->data + * @offset: offset within packet from skb->mac_header * @from: pointer where to copy bytes from * @len: number of bytes to store into packet * @flags: bit 0 - if true, recompute skb->csum diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h index 34c7936ca114..c97340e43dd6 100644 --- a/include/uapi/linux/filter.h +++ b/include/uapi/linux/filter.h @@ -79,8 +79,11 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ #define SKF_AD_RANDOM 56 #define SKF_AD_VLAN_TPID 60 #define SKF_AD_MAX 64 -#define SKF_NET_OFF (-0x100000) -#define SKF_LL_OFF (-0x200000) +#define SKF_NET_OFF (-0x100000) +#define SKF_LL_OFF (-0x200000) + +#define BPF_NET_OFF SKF_NET_OFF +#define BPF_LL_OFF SKF_LL_OFF #endif /* _UAPI__LINUX_FILTER_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index b669e75d2b36..bf831a85c315 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1175,12 +1175,27 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } +/** + * bpf_skb_clone_not_writable - is the header of a clone not writable + * @skb: buffer to check + * @len: length up to which to write, can be negative + * + * Returns true if modifying the header part of the cloned buffer + * does require the data to be copied. I.e. this version works with + * negative lengths needed for eBPF case! + */ +static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len) +{ + return skb_header_cloned(skb) || + (int) skb_headroom(skb) + len > skb->hdr_len; +} + #define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; + int offset = (int) r2; void *from = (void *) (long) r3; unsigned int len = (unsigned int) r4; char buf[16]; @@ -1194,10 +1209,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) * * so check for invalid 'offset' and too large 'len' */ - if (unlikely(offset > 0xffff || len > sizeof(buf))) + if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) return -EFAULT; - if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len)) + offset -= skb->data - skb_mac_header(skb); + if (unlikely(skb_cloned(skb) && + bpf_skb_clone_unwritable(skb, offset + len))) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, buf); @@ -1232,15 +1249,18 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = { #define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) #define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) -static u64 bpf_l3_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags) +static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; + int offset = (int) r2; __sum16 sum, *ptr; - if (unlikely(offset > 0xffff)) + if (unlikely((u32) offset > 0xffff)) return -EFAULT; - if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum))) + offset -= skb->data - skb_mac_header(skb); + if (unlikely(skb_cloned(skb) && + bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1276,16 +1296,19 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_l4_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags) +static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags); + int offset = (int) r2; __sum16 sum, *ptr; - if (unlikely(offset > 0xffff)) + if (unlikely((u32) offset > 0xffff)) return -EFAULT; - if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum))) + offset -= skb->data - skb_mac_header(skb); + if (unlikely(skb_cloned(skb) && + bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 4d2cede17468..dc6a2d324bd8 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -38,6 +38,9 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_bpf *prog = act->priv; int action, filter_res; + if (unlikely(!skb_mac_header_was_set(skb))) + return TC_ACT_UNSPEC; + spin_lock(&prog->tcf_lock); prog->tcf_tm.lastuse = jiffies; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5c4171c5d2bd..91bd9c19471d 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -66,6 +66,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_bpf_prog *prog; int ret = -1; + if (unlikely(!skb_mac_header_was_set(skb))) + return -1; + /* Needed here for accessing maps. */ rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c index 7cf3f42a6e39..7c27710f8296 100644 --- a/samples/bpf/tcbpf1_kern.c +++ b/samples/bpf/tcbpf1_kern.c @@ -4,6 +4,8 @@ #include #include #include +#include + #include "bpf_helpers.h" /* compiler workaround */ @@ -14,18 +16,12 @@ static inline void set_dst_mac(struct __sk_buff *skb, char *mac) bpf_skb_store_bytes(skb, 0, mac, ETH_ALEN, 1); } -/* use 1 below for ingress qdisc and 0 for egress */ -#if 0 -#undef ETH_HLEN -#define ETH_HLEN 0 -#endif - #define IP_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check)) #define TOS_OFF (ETH_HLEN + offsetof(struct iphdr, tos)) static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) { - __u8 old_tos = load_byte(skb, TOS_OFF); + __u8 old_tos = load_byte(skb, BPF_LL_OFF + TOS_OFF); bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2); bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0); @@ -38,7 +34,7 @@ static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) { - __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF)); + __u32 old_ip = _htonl(load_word(skb, BPF_LL_OFF + IP_SRC_OFF)); bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip)); bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); @@ -48,7 +44,7 @@ static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) #define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest)) static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) { - __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF)); + __u16 old_port = htons(load_half(skb, BPF_LL_OFF + TCP_DPORT_OFF)); bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port)); bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0); @@ -57,7 +53,7 @@ static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) SEC("classifier") int bpf_prog1(struct __sk_buff *skb) { - __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); + __u8 proto = load_byte(skb, BPF_LL_OFF + ETH_HLEN + offsetof(struct iphdr, protocol)); long *value; if (proto == IPPROTO_TCP) { -- cgit v1.2.3 From e15f431fe2d53cd4673510736da7d4fa1090e096 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 16 Apr 2015 12:44:47 -0700 Subject: errno.h: Improve ENOSYS's comment ENOSYS is the mechanism used by user code to detect whether the running kernel implements a given system call. It should not be returned by anything except an unimplemented system call. Unfortunately, it is rather frequently used in the kernel to indicate that various new functions of existing system calls are not implemented. This should be discouraged. Improve the comment in errno.h to help clarify ENOSYS's purpose. Signed-off-by: Andy Lutomirski Cc: Pavel Machek Cc: Michael Kerrisk Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/errno.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/asm-generic/errno.h b/include/uapi/asm-generic/errno.h index 1e1ea6e6e7a5..88e0914cf2d9 100644 --- a/include/uapi/asm-generic/errno.h +++ b/include/uapi/asm-generic/errno.h @@ -6,7 +6,16 @@ #define EDEADLK 35 /* Resource deadlock would occur */ #define ENAMETOOLONG 36 /* File name too long */ #define ENOLCK 37 /* No record locks available */ -#define ENOSYS 38 /* Function not implemented */ + +/* + * This error code is special: arch syscall entry code will return + * -ENOSYS if users try to call a syscall that doesn't exist. To keep + * failures of syscalls that really do exist distinguishable from + * failures due to attempts to use a nonexistent syscall, syscall + * implementations should refrain from returning -ENOSYS. + */ +#define ENOSYS 38 /* Invalid system call number */ + #define ENOTEMPTY 39 /* Directory not empty */ #define ELOOP 40 /* Too many symbolic links encountered */ #define EWOULDBLOCK EAGAIN /* Operation would block */ -- cgit v1.2.3 From cec32a47010647e8b0603726ebb75b990a4057a4 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Fri, 17 Apr 2015 19:12:41 +0200 Subject: media-bus: Fixup RGB444_1X12, RGB565_1X16, and YUV8_1X24 media bus format Change the constant values for RGB444_1X12, RGB565_1X16, and YUV8_1X24 media bus formats in anticipation of a merge conflict with the media tree, where the old values are already taken by RBG888_1X24, RGB888_1X32_PADHI, and VUY8_1X24, respectively. Signed-off-by: Philipp Zabel Signed-off-by: Dave Airlie --- Documentation/DocBook/media/v4l/subdev-formats.xml | 6 +++--- include/uapi/linux/media-bus-format.h | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/DocBook/media/v4l/subdev-formats.xml b/Documentation/DocBook/media/v4l/subdev-formats.xml index 18b71aff48c9..553a38024745 100644 --- a/Documentation/DocBook/media/v4l/subdev-formats.xml +++ b/Documentation/DocBook/media/v4l/subdev-formats.xml @@ -196,7 +196,7 @@ see . MEDIA_BUS_FMT_RGB444_1X12 - 0x100e + 0x1016 &dash-ent-20; r3 @@ -326,7 +326,7 @@ see . MEDIA_BUS_FMT_RGB565_1X16 - 0x100f + 0x1017 &dash-ent-16; r4 @@ -3049,7 +3049,7 @@ see . MEDIA_BUS_FMT_YUV8_1X24 - 0x2024 + 0x2025 - - diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h index 83ea46f4be51..73c78f18a328 100644 --- a/include/uapi/linux/media-bus-format.h +++ b/include/uapi/linux/media-bus-format.h @@ -33,13 +33,13 @@ #define MEDIA_BUS_FMT_FIXED 0x0001 -/* RGB - next is 0x1016 */ -#define MEDIA_BUS_FMT_RGB444_1X12 0x100e +/* RGB - next is 0x1018 */ +#define MEDIA_BUS_FMT_RGB444_1X12 0x1016 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE 0x1001 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE 0x1002 #define MEDIA_BUS_FMT_RGB555_2X8_PADHI_BE 0x1003 #define MEDIA_BUS_FMT_RGB555_2X8_PADHI_LE 0x1004 -#define MEDIA_BUS_FMT_RGB565_1X16 0x100f +#define MEDIA_BUS_FMT_RGB565_1X16 0x1017 #define MEDIA_BUS_FMT_BGR565_2X8_BE 0x1005 #define MEDIA_BUS_FMT_BGR565_2X8_LE 0x1006 #define MEDIA_BUS_FMT_RGB565_2X8_BE 0x1007 @@ -56,7 +56,7 @@ #define MEDIA_BUS_FMT_RGB888_1X7X4_JEIDA 0x1012 #define MEDIA_BUS_FMT_ARGB8888_1X32 0x100d -/* YUV (including grey) - next is 0x2025 */ +/* YUV (including grey) - next is 0x2026 */ #define MEDIA_BUS_FMT_Y8_1X8 0x2001 #define MEDIA_BUS_FMT_UV8_1X8 0x2015 #define MEDIA_BUS_FMT_UYVY8_1_5X8 0x2002 @@ -82,7 +82,7 @@ #define MEDIA_BUS_FMT_VYUY10_1X20 0x201b #define MEDIA_BUS_FMT_YUYV10_1X20 0x200d #define MEDIA_BUS_FMT_YVYU10_1X20 0x200e -#define MEDIA_BUS_FMT_YUV8_1X24 0x2024 +#define MEDIA_BUS_FMT_YUV8_1X24 0x2025 #define MEDIA_BUS_FMT_YUV10_1X30 0x2016 #define MEDIA_BUS_FMT_AYUV8_1X32 0x2017 #define MEDIA_BUS_FMT_UYVY12_2X12 0x201c -- cgit v1.2.3