From 1549210fcc17e9ae20c09ac8cd4c48a8dfd431bd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2012 09:16:47 -0400 Subject: NFSv4: Fix an Oops in the open recovery code The open recovery code does not need to request a new value for the mdsthreshold, and so does not allocate a struct nfs4_threshold. The problem is that encode_getfattr_open() will still request an mdsthreshold, and so we end up Oopsing in decode_attr_mdsthreshold. This patch fixes encode_getfattr_open so that it doesn't request an mdsthreshold when the caller isn't asking for one. It also fixes decode_attr_mdsthreshold so that it errors if the server returns an mdsthreshold that we didn't ask for (instead of Oopsing). Signed-off-by: Trond Myklebust Cc: Andy Adamson --- include/linux/nfs_xdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d1a7bf51c326..7519baef025b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -348,6 +348,7 @@ struct nfs_openargs { const struct qstr * name; const struct nfs_server *server; /* Needed for ID mapping */ const u32 * bitmask; + const u32 * open_bitmap; __u32 claim; struct nfs4_sequence_args seq_args; }; -- cgit v1.2.3 From 9bce008bae8b57bc7b007bcc2071d1247a527120 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2012 18:32:03 -0400 Subject: NFS: Fix a commit bug The new commit code fails to copy the verifier into the wb_verf field of _all_ the nfs_page structures; it only copies it into the first entry. The consequence is that most requests end up failing to match in nfs_commit_release. Fix is to copy the verifier into the req->wb_verf field in nfs_write_completion. Signed-off-by: Trond Myklebust Cc: Fred Isaman --- fs/nfs/direct.c | 4 ++-- fs/nfs/write.c | 7 ++++--- include/linux/nfs_xdr.h | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 23d170bc44f4..b5385a7efd56 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -710,12 +710,12 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) bit = NFS_IOHDR_NEED_RESCHED; else if (dreq->flags == 0) { - memcpy(&dreq->verf, &req->wb_verf, + memcpy(&dreq->verf, hdr->verf, sizeof(dreq->verf)); bit = NFS_IOHDR_NEED_COMMIT; dreq->flags = NFS_ODIRECT_DO_COMMIT; } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - if (memcmp(&dreq->verf, &req->wb_verf, sizeof(dreq->verf))) { + if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) { dreq->flags = NFS_ODIRECT_RESCHED_WRITES; bit = NFS_IOHDR_NEED_RESCHED; } else diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e6fe3d69d14c..4d6861c0dc14 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -80,6 +80,7 @@ struct nfs_write_header *nfs_writehdr_alloc(void) INIT_LIST_HEAD(&hdr->rpc_list); spin_lock_init(&hdr->lock); atomic_set(&hdr->refcnt, 0); + hdr->verf = &p->verf; } return p; } @@ -619,6 +620,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) goto next; } if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { + memcpy(&req->wb_verf, hdr->verf, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo); goto next; } @@ -1255,15 +1257,14 @@ static void nfs_writeback_release_common(void *calldata) struct nfs_write_data *data = calldata; struct nfs_pgio_header *hdr = data->header; int status = data->task.tk_status; - struct nfs_page *req = hdr->req; if ((status >= 0) && nfs_write_need_commit(data)) { spin_lock(&hdr->lock); if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) ; /* Do nothing */ else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) - memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); - else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) + memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf)); + else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf))) set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); spin_unlock(&hdr->lock); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7519baef025b..8aadd90b808a 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1237,6 +1237,7 @@ struct nfs_pgio_header { struct list_head rpc_list; atomic_t refcnt; struct nfs_page *req; + struct nfs_writeverf *verf; struct pnfs_layout_segment *lseg; loff_t io_start; const struct rpc_call_ops *mds_ops; @@ -1274,6 +1275,7 @@ struct nfs_write_data { struct nfs_write_header { struct nfs_pgio_header header; struct nfs_write_data rpc_data; + struct nfs_writeverf verf; }; struct nfs_mds_commit_info { -- cgit v1.2.3 From 55432d2b543a4b6dfae54f5c432a566877a85d90 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Jun 2012 03:00:18 +0000 Subject: inetpeer: fix a race in inetpeer_gc_worker() commit 5faa5df1fa2024 (inetpeer: Invalidate the inetpeer tree along with the routing cache) added a race : Before freeing an inetpeer, we must respect a RCU grace period, and make sure no user will attempt to increase refcnt. inetpeer_invalidate_tree() waits for a RCU grace period before inserting inetpeer tree into gc_list and waking the worker. At that time, no concurrent lookup can find a inetpeer in this tree. Signed-off-by: Eric Dumazet Cc: Steffen Klassert Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/inetpeer.h | 5 ++++- net/ipv4/inetpeer.c | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index b94765e38e80..2040bff945d4 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -40,7 +40,10 @@ struct inet_peer { u32 pmtu_orig; u32 pmtu_learned; struct inetpeer_addr_base redirect_learned; - struct list_head gc_list; + union { + struct list_head gc_list; + struct rcu_head gc_rcu; + }; /* * Once inet_peer is queued for deletion (refcnt == -1), following fields * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index d4d61b694fab..dfba343b2509 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -560,6 +560,17 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) } EXPORT_SYMBOL(inet_peer_xrlim_allow); +static void inetpeer_inval_rcu(struct rcu_head *head) +{ + struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu); + + spin_lock_bh(&gc_lock); + list_add_tail(&p->gc_list, &gc_list); + spin_unlock_bh(&gc_lock); + + schedule_delayed_work(&gc_work, gc_delay); +} + void inetpeer_invalidate_tree(int family) { struct inet_peer *old, *new, *prev; @@ -576,10 +587,7 @@ void inetpeer_invalidate_tree(int family) prev = cmpxchg(&base->root, old, new); if (prev == old) { base->total = 0; - spin_lock(&gc_lock); - list_add_tail(&prev->gc_list, &gc_list); - spin_unlock(&gc_lock); - schedule_delayed_work(&gc_work, gc_delay); + call_rcu(&prev->gc_rcu, inetpeer_inval_rcu); } out: -- cgit v1.2.3 From fd4b352687fd8604d49c190c4c9ea9e369fd42d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 5 May 2012 19:10:35 -0700 Subject: rcu: Update RCU_FAST_NO_HZ tracing for lazy callbacks In the current code, a short dyntick-idle interval (where there is at least one non-lazy callback on the CPU) and a long dyntick-idle interval (where there are only lazy callbacks on the CPU) are traced identically, which can be less than helpful. This commit therefore emits different event traces in these two cases. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- include/trace/events/rcu.h | 1 + kernel/rcutree_plugin.h | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 1480900c511c..d274734b2aa4 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -289,6 +289,7 @@ TRACE_EVENT(rcu_dyntick, * "In holdoff": Nothing to do, holding off after unsuccessful attempt. * "Begin holdoff": Attempt failed, don't retry until next jiffy. * "Dyntick with callbacks": Entering dyntick-idle despite callbacks. + * "Dyntick with lazy callbacks": Entering dyntick-idle w/lazy callbacks. * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! * "Timer": Timer fired to cause CPU to continue processing callbacks. diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2411000d9869..5449f02c4820 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2165,15 +2165,17 @@ static void rcu_prepare_for_idle(int cpu) !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ - trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_GP_DELAY; - else + } else { per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_LAZY_GP_DELAY; + trace_rcu_prep_idle("Dyntick with lazy callbacks"); + } tp = &per_cpu(rcu_idle_gp_timer, cpu); mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); per_cpu(rcu_nonlazy_posted_snap, cpu) = -- cgit v1.2.3 From aa9b16306e3243229580ff889cc59fd66bf77973 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 May 2012 16:41:44 -0700 Subject: rcu: Precompute RCU_FAST_NO_HZ timer offsets When a CPU is entering dyntick-idle mode, tick_nohz_stop_sched_tick() calls rcu_needs_cpu() see if RCU needs that CPU, and, if not, computes the next wakeup time based on the timer wheels. Only later, when actually entering the idle loop, rcu_prepare_for_idle() will be invoked. In some cases, rcu_prepare_for_idle() will post timers to wake the CPU back up. But all for naught: The next wakeup time for the CPU has already been computed, and posting a timer afterwards does not force that wakeup time to be recomputed. This means that rcu_prepare_for_idle()'s have no effect. This is not a problem on a busy system because something else will wake up the CPU soon enough. However, on lightly loaded systems, the CPU might stay asleep for a considerable length of time. If that CPU has a callback that the rest of the system is waiting on, the system might run very slowly or (in theory) even hang. This commit avoids this problem by having rcu_needs_cpu() give tick_nohz_stop_sched_tick() an estimate of when RCU will need the CPU to wake back up, which tick_nohz_stop_sched_tick() takes into account when programming the CPU's wakeup time. An alternative approach is for rcu_prepare_for_idle() to use hrtimers instead of normal timers, but timers are much more efficient than are hrtimers for frequently and repeatedly posting and cancelling a given timer, which is exactly what RCU_FAST_NO_HZ does. Reported-by: Pascal Chapperon Reported-by: Heiko Carstens Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- include/linux/rcutiny.h | 6 +++-- include/linux/rcutree.h | 2 +- kernel/rcutree_plugin.h | 66 +++++++++++++++++++++++++++++++----------------- kernel/time/tick-sched.c | 7 ++++- 4 files changed, 54 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index adb5e5a38cae..854dc4c5c271 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,8 +87,9 @@ static inline void kfree_call_rcu(struct rcu_head *head, #ifdef CONFIG_TINY_RCU -static inline int rcu_needs_cpu(int cpu) +static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return 0; } @@ -96,8 +97,9 @@ static inline int rcu_needs_cpu(int cpu) int rcu_preempt_needs_cpu(void); -static inline int rcu_needs_cpu(int cpu) +static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_preempt_needs_cpu(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 3c6083cde4fc..952b79339304 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -32,7 +32,7 @@ extern void rcu_init(void); extern void rcu_note_context_switch(int cpu); -extern int rcu_needs_cpu(int cpu); +extern int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies); extern void rcu_cpu_stall_reset(void); /* diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6bd9637d5d83..5271a020887e 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1886,8 +1886,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ -int rcu_needs_cpu(int cpu) +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu); } @@ -1962,28 +1963,6 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -/* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - /* Flag a new idle sojourn to the idle-entry state machine. */ - rdtp->idle_first_pass = 1; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) - return 0; - /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return rdtp->dyntick_holdoff == jiffies; -} - /* * Does the specified flavor of RCU have non-lazy callbacks pending on * the specified CPU? Both RCU flavor and CPU are specified by the @@ -2026,6 +2005,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) rcu_preempt_cpu_has_nonlazy_callbacks(cpu); } +/* + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + * + * The delta_jiffies argument is used to store the time when RCU is + * going to need the CPU again if it still has callbacks. The reason + * for this is that rcu_prepare_for_idle() might need to post a timer, + * but if so, it will do so after tick_nohz_stop_sched_tick() has set + * the wakeup time for this CPU. This means that RCU's timer can be + * delayed until the wakeup time, which defeats the purpose of posting + * a timer. + */ +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +{ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + /* Flag a new idle sojourn to the idle-entry state machine. */ + rdtp->idle_first_pass = 1; + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) { + *delta_jiffies = ULONG_MAX; + return 0; + } + if (rdtp->dyntick_holdoff == jiffies) { + /* RCU recently tried and failed, so don't try again. */ + *delta_jiffies = 1; + return 1; + } + /* Set up for the possibility that RCU will post a timer. */ + if (rcu_cpu_has_nonlazy_callbacks(cpu)) + *delta_jiffies = RCU_IDLE_GP_DELAY; + else + *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; + return 0; +} + /* * Handler for smp_call_function_single(). The only point of this * handler is to wake the CPU up, so the handler does only tracing. diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..52f5ebbd443b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + unsigned long rcu_delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; @@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; + if (rcu_delta_jiffies < delta_jiffies) { + next_jiffies = last_jiffies + rcu_delta_jiffies; + delta_jiffies = rcu_delta_jiffies; + } } /* * Do not stop the tick, if we are only one off -- cgit v1.2.3 From d1992b169d31f339dc5ea4e9f312567c8cf322a3 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Thu, 17 May 2012 22:35:46 +0000 Subject: netfilter: xt_HMARK: fix endianness and provide consistent hashing This patch addresses two issues: a) Fix usage of u32 and __be32 that causes endianess warnings via sparse. b) Ensure consistent hashing in a cluster that is composed of big and little endian systems. Thus, we obtain the same hash mark in an heterogeneous cluster. Reported-by: Dan Carpenter Signed-off-by: Hans Schillstrom Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/xt_HMARK.h | 5 +++ net/netfilter/xt_HMARK.c | 72 ++++++++++++++++++++++---------------- 2 files changed, 46 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/xt_HMARK.h b/include/linux/netfilter/xt_HMARK.h index abb1650940d2..826fc5807577 100644 --- a/include/linux/netfilter/xt_HMARK.h +++ b/include/linux/netfilter/xt_HMARK.h @@ -27,7 +27,12 @@ union hmark_ports { __u16 src; __u16 dst; } p16; + struct { + __be16 src; + __be16 dst; + } b16; __u32 v32; + __be32 b32; }; struct xt_hmark_info { diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index 0a96a43108ed..1686ca1b53a1 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -32,13 +32,13 @@ MODULE_ALIAS("ipt_HMARK"); MODULE_ALIAS("ip6t_HMARK"); struct hmark_tuple { - u32 src; - u32 dst; + __be32 src; + __be32 dst; union hmark_ports uports; - uint8_t proto; + u8 proto; }; -static inline u32 hmark_addr6_mask(const __u32 *addr32, const __u32 *mask) +static inline __be32 hmark_addr6_mask(const __be32 *addr32, const __be32 *mask) { return (addr32[0] & mask[0]) ^ (addr32[1] & mask[1]) ^ @@ -46,8 +46,8 @@ static inline u32 hmark_addr6_mask(const __u32 *addr32, const __u32 *mask) (addr32[3] & mask[3]); } -static inline u32 -hmark_addr_mask(int l3num, const __u32 *addr32, const __u32 *mask) +static inline __be32 +hmark_addr_mask(int l3num, const __be32 *addr32, const __be32 *mask) { switch (l3num) { case AF_INET: @@ -58,6 +58,22 @@ hmark_addr_mask(int l3num, const __u32 *addr32, const __u32 *mask) return 0; } +static inline void hmark_swap_ports(union hmark_ports *uports, + const struct xt_hmark_info *info) +{ + union hmark_ports hp; + u16 src, dst; + + hp.b32 = (uports->b32 & info->port_mask.b32) | info->port_set.b32; + src = ntohs(hp.b16.src); + dst = ntohs(hp.b16.dst); + + if (dst > src) + uports->v32 = (dst << 16) | src; + else + uports->v32 = (src << 16) | dst; +} + static int hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t, const struct xt_hmark_info *info) @@ -74,22 +90,19 @@ hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t, otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.all, - info->src_mask.all); - t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.all, - info->dst_mask.all); + t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.ip6, + info->src_mask.ip6); + t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.ip6, + info->dst_mask.ip6); if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) return 0; t->proto = nf_ct_protonum(ct); if (t->proto != IPPROTO_ICMP) { - t->uports.p16.src = otuple->src.u.all; - t->uports.p16.dst = rtuple->src.u.all; - t->uports.v32 = (t->uports.v32 & info->port_mask.v32) | - info->port_set.v32; - if (t->uports.p16.dst < t->uports.p16.src) - swap(t->uports.p16.dst, t->uports.p16.src); + t->uports.b16.src = otuple->src.u.all; + t->uports.b16.dst = rtuple->src.u.all; + hmark_swap_ports(&t->uports, info); } return 0; @@ -98,15 +111,19 @@ hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t, #endif } +/* This hash function is endian independent, to ensure consistent hashing if + * the cluster is composed of big and little endian systems. */ static inline u32 hmark_hash(struct hmark_tuple *t, const struct xt_hmark_info *info) { u32 hash; + u32 src = ntohl(t->src); + u32 dst = ntohl(t->dst); - if (t->dst < t->src) - swap(t->src, t->dst); + if (dst < src) + swap(src, dst); - hash = jhash_3words(t->src, t->dst, t->uports.v32, info->hashrnd); + hash = jhash_3words(src, dst, t->uports.v32, info->hashrnd); hash = hash ^ (t->proto & info->proto_mask); return (((u64)hash * info->hmodulus) >> 32) + info->hoffset; @@ -126,11 +143,7 @@ hmark_set_tuple_ports(const struct sk_buff *skb, unsigned int nhoff, if (skb_copy_bits(skb, nhoff, &t->uports, sizeof(t->uports)) < 0) return; - t->uports.v32 = (t->uports.v32 & info->port_mask.v32) | - info->port_set.v32; - - if (t->uports.p16.dst < t->uports.p16.src) - swap(t->uports.p16.dst, t->uports.p16.src); + hmark_swap_ports(&t->uports, info); } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) @@ -178,8 +191,8 @@ hmark_pkt_set_htuple_ipv6(const struct sk_buff *skb, struct hmark_tuple *t, return -1; } noicmp: - t->src = hmark_addr6_mask(ip6->saddr.s6_addr32, info->src_mask.all); - t->dst = hmark_addr6_mask(ip6->daddr.s6_addr32, info->dst_mask.all); + t->src = hmark_addr6_mask(ip6->saddr.s6_addr32, info->src_mask.ip6); + t->dst = hmark_addr6_mask(ip6->daddr.s6_addr32, info->dst_mask.ip6); if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) return 0; @@ -255,11 +268,8 @@ hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t, } } - t->src = (__force u32) ip->saddr; - t->dst = (__force u32) ip->daddr; - - t->src &= info->src_mask.ip; - t->dst &= info->dst_mask.ip; + t->src = ip->saddr & info->src_mask.ip; + t->dst = ip->daddr & info->dst_mask.ip; if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) return 0; -- cgit v1.2.3 From c8e9cf7bb240049117d2fa64d1540476c289396d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 7 Jun 2012 12:15:15 +0200 Subject: vga_switcheroo: Add a helper function to get the client state Add vga_switcheroo_get_client_state() to get the current state of the client. This is necessary to determine the proper initial state of audio clients in HD-audio driver. Acked-by: Dave Airlie Signed-off-by: Takashi Iwai --- drivers/gpu/vga/vga_switcheroo.c | 13 +++++++++++++ include/linux/vga_switcheroo.h | 7 +++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/drivers/gpu/vga/vga_switcheroo.c b/drivers/gpu/vga/vga_switcheroo.c index 38f9534ac513..eb4f64f0a565 100644 --- a/drivers/gpu/vga/vga_switcheroo.c +++ b/drivers/gpu/vga/vga_switcheroo.c @@ -190,6 +190,19 @@ find_active_client(struct list_head *head) return NULL; } +int vga_switcheroo_get_client_state(struct pci_dev *pdev) +{ + struct vga_switcheroo_client *client; + + client = find_client_from_pci(&vgasr_priv.clients, pdev); + if (!client) + return VGA_SWITCHEROO_NOT_FOUND; + if (!vgasr_priv.active) + return VGA_SWITCHEROO_INIT; + return client->pwr_state; +} +EXPORT_SYMBOL(vga_switcheroo_get_client_state); + void vga_switcheroo_unregister_client(struct pci_dev *pdev) { struct vga_switcheroo_client *client; diff --git a/include/linux/vga_switcheroo.h b/include/linux/vga_switcheroo.h index b455c7c212eb..b176342ca031 100644 --- a/include/linux/vga_switcheroo.h +++ b/include/linux/vga_switcheroo.h @@ -12,6 +12,9 @@ enum vga_switcheroo_state { VGA_SWITCHEROO_OFF, VGA_SWITCHEROO_ON, + /* below are referred only from vga_switcheroo_get_client_state() */ + VGA_SWITCHEROO_INIT, + VGA_SWITCHEROO_NOT_FOUND, }; enum vga_switcheroo_client_id { @@ -50,6 +53,8 @@ void vga_switcheroo_unregister_handler(void); int vga_switcheroo_process_delayed_switch(void); +int vga_switcheroo_get_client_state(struct pci_dev *dev); + #else static inline void vga_switcheroo_unregister_client(struct pci_dev *dev) {} @@ -62,5 +67,7 @@ static inline int vga_switcheroo_register_audio_client(struct pci_dev *pdev, int id, bool active) { return 0; } static inline void vga_switcheroo_unregister_handler(void) {} static inline int vga_switcheroo_process_delayed_switch(void) { return 0; } +static inline int vga_switcheroo_get_client_state(struct pci_dev *dev) { return VGA_SWITCHEROO_CLIENT_ON; } + #endif -- cgit v1.2.3 From 505cff00de9c303b95c204eb4544066e3e707911 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 8 Jun 2012 12:49:17 +0200 Subject: vga_switcheroo: Fix error without CONFIG_VGA_SWITCHEROO Fix a typo that is built only when CONFIG_VGA_SWITCHEROO=n. Signed-off-by: Takashi Iwai --- include/linux/vga_switcheroo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vga_switcheroo.h b/include/linux/vga_switcheroo.h index b176342ca031..60da41fe9dc2 100644 --- a/include/linux/vga_switcheroo.h +++ b/include/linux/vga_switcheroo.h @@ -67,7 +67,7 @@ static inline int vga_switcheroo_register_audio_client(struct pci_dev *pdev, int id, bool active) { return 0; } static inline void vga_switcheroo_unregister_handler(void) {} static inline int vga_switcheroo_process_delayed_switch(void) { return 0; } -static inline int vga_switcheroo_get_client_state(struct pci_dev *dev) { return VGA_SWITCHEROO_CLIENT_ON; } +static inline int vga_switcheroo_get_client_state(struct pci_dev *dev) { return VGA_SWITCHEROO_ON; } #endif -- cgit v1.2.3 From 8876d6b5f81f4e242a6660da22bbd92f17a8d058 Mon Sep 17 00:00:00 2001 From: Paul Pluzhnikov Date: Sat, 9 Jun 2012 07:53:03 -0700 Subject: net: Make linux/tcp.h C++ friendly (trivial) I originally sent this patch to , but Jiri Kosina did not feel that this is fully appropriate for the trivial tree. Using linux/tcp.h from C++ results in: cat t.cc #include int main() { } g++ -c t.cc In file included from t.cc:1: /usr/include/linux/tcp.h:72: error: '__u32 __fswab32(__u32)' cannot appear in a constant-expression /usr/include/linux/tcp.h:72: error: a function call cannot appear in a constant-expression ... Attached trivial patch fixes this problem. Tested: - the t.cc above compiles with g++ and - the following program generates the same output before/after the patch: #include #include int main () { #define P(a) printf("%s: %08x\n", #a, (int)a) P(TCP_FLAG_CWR); P(TCP_FLAG_ECE); P(TCP_FLAG_URG); P(TCP_FLAG_ACK); P(TCP_FLAG_PSH); P(TCP_FLAG_RST); P(TCP_FLAG_SYN); P(TCP_FLAG_FIN); P(TCP_RESERVED_BITS); P(TCP_DATA_OFFSET); #undef P return 0; } Signed-off-by: Paul Pluzhnikov Signed-off-by: David S. Miller --- include/linux/tcp.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4c5b63283377..5f359dbfcdce 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -69,16 +69,16 @@ union tcp_word_hdr { #define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) enum { - TCP_FLAG_CWR = __cpu_to_be32(0x00800000), - TCP_FLAG_ECE = __cpu_to_be32(0x00400000), - TCP_FLAG_URG = __cpu_to_be32(0x00200000), - TCP_FLAG_ACK = __cpu_to_be32(0x00100000), - TCP_FLAG_PSH = __cpu_to_be32(0x00080000), - TCP_FLAG_RST = __cpu_to_be32(0x00040000), - TCP_FLAG_SYN = __cpu_to_be32(0x00020000), - TCP_FLAG_FIN = __cpu_to_be32(0x00010000), - TCP_RESERVED_BITS = __cpu_to_be32(0x0F000000), - TCP_DATA_OFFSET = __cpu_to_be32(0xF0000000) + TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), + TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), + TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), + TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000), + TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000), + TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000), + TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000), + TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), + TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), + TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) }; /* -- cgit v1.2.3 From 3777808873b0c49c5cf27e44c948dfb02675d578 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 11 Jun 2012 14:29:58 +0900 Subject: bug.h: need linux/kernel.h for TAINT_WARN. asm-generic/bug.h uses taint flags that are only defined in linux/kernel.h, resulting in build failures on platforms that don't include linux/kernel.h some other way: arch/sh/include/asm/thread_info.h:172:2: error: 'TAINT_WARN' undeclared (first use in this function) Caused by commit edd63a2763bd ("set_restore_sigmask() is never called without SIGPENDING (and never should be)"). Reported-by: Stephen Rothwell Cc: Al Viro Signed-off-by: Paul Mundt --- include/asm-generic/bug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 2520a6e241dc..9f02005f217a 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -2,6 +2,7 @@ #define _ASM_GENERIC_BUG_H #include +#include #ifdef CONFIG_BUG -- cgit v1.2.3 From c5d21c4b2a7765ab0600c8426374b50eb9f4a36f Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sun, 10 Jun 2012 20:05:24 +0000 Subject: net: Reorder initialization in ip_route_output to fix gcc warning If I build with W=1, for every file that includes , I get the warning include/net/route.h: In function 'ip_route_output': include/net/route.h:135:3: warning: initialized field overwritten [-Woverride-init] include/net/route.h:135:3: warning: (near initialization for 'fl4') [-Woverride-init] (This is with "gcc (Debian 4.6.3-1) 4.6.3") A fix seems pretty trivial: move the initialization of .flowi4_tos earlier. As far as I can tell, this has no effect on code generation. Signed-off-by: Roland Dreier Signed-off-by: David S. Miller --- include/net/route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index ed2b78e2375d..98705468ac03 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -130,9 +130,9 @@ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, { struct flowi4 fl4 = { .flowi4_oif = oif, + .flowi4_tos = tos, .daddr = daddr, .saddr = saddr, - .flowi4_tos = tos, }; return ip_route_output_key(net, &fl4); } -- cgit v1.2.3 From fe4561680519019cc15d660862dce513ded2f3a7 Mon Sep 17 00:00:00 2001 From: Paulo Zanoni Date: Tue, 12 Jun 2012 11:27:01 -0300 Subject: drm: increase DRM_OBJECT_MAX_PROPERTY to 24 Before Kernel 3.5, no one was checking for the return value of drm_connector_attach_property, so we never noticed that we were unable to create some properties. Commit "drm: WARN() when drm_connector_attach_property fails" added a WARN when we fail to create a property, and the transition from "connector properties" to "object properties" changed the warning message a little bit. On i915 machines with many TV connectors we hit the maximum number of properties (since each TV connector uses a lot of properties), so we get a few backtraces in our logs. This commit increases the maximum number of properties to 24 hoping we'll have enough room for everybody. Chris suggested that we convert this code to "lists", but I believe this conversion can come after we make sure people's dmesgs are not spammed by our driver. Signed-off-by: Paulo Zanoni Reported-by: Dave Jones Tested-by: Daniel Vetter Signed-off-by: Dave Airlie --- include/drm/drm_crtc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index 73e45600f95d..bac55c215113 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -54,7 +54,7 @@ struct drm_mode_object { struct drm_object_properties *properties; }; -#define DRM_OBJECT_MAX_PROPERTY 16 +#define DRM_OBJECT_MAX_PROPERTY 24 struct drm_object_properties { int count; uint32_t ids[DRM_OBJECT_MAX_PROPERTY]; -- cgit v1.2.3 From 5ee31c6898ea5537fcea160999d60dc63bc0c305 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Jun 2012 06:03:51 +0000 Subject: bonding: Fix corrupted queue_mapping In the transmit path of the bonding driver, skb->cb is used to stash the skb->queue_mapping so that the bonding device can set its own queue mapping. This value becomes corrupted since the skb->cb is also used in __dev_xmit_skb. When transmitting through bonding driver, bond_select_queue is called from dev_queue_xmit. In bond_select_queue the original skb->queue_mapping is copied into skb->cb (via bond_queue_mapping) and skb->queue_mapping is overwritten with the bond driver queue. Subsequently in dev_queue_xmit, __dev_xmit_skb is called which writes the packet length into skb->cb, thereby overwriting the stashed queue mappping. In bond_dev_queue_xmit (called from hard_start_xmit), the queue mapping for the skb is set to the stashed value which is now the skb length and hence is an invalid queue for the slave device. If we want to save skb->queue_mapping into skb->cb[], best place is to add a field in struct qdisc_skb_cb, to make sure it wont conflict with other layers (eg : Qdiscc, Infiniband...) This patchs also makes sure (struct qdisc_skb_cb)->data is aligned on 8 bytes : netem qdisc for example assumes it can store an u64 in it, without misalignment penalty. Note : we only have 20 bytes left in (struct qdisc_skb_cb)->data[]. The largest user is CHOKe and it fills it. Based on a previous patch from Tom Herbert. Signed-off-by: Eric Dumazet Reported-by: Tom Herbert Cc: John Fastabend Cc: Roland Dreier Acked-by: Neil Horman Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 9 +++++---- include/net/sch_generic.h | 7 +++++-- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2ee8cf9e8a3b..b9c2ae62166d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -76,6 +76,7 @@ #include #include #include +#include #include "bonding.h" #include "bond_3ad.h" #include "bond_alb.h" @@ -381,8 +382,6 @@ struct vlan_entry *bond_next_vlan(struct bonding *bond, struct vlan_entry *curr) return next; } -#define bond_queue_mapping(skb) (*(u16 *)((skb)->cb)) - /** * bond_dev_queue_xmit - Prepare skb for xmit. * @@ -395,7 +394,9 @@ int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, { skb->dev = slave_dev; - skb->queue_mapping = bond_queue_mapping(skb); + BUILD_BUG_ON(sizeof(skb->queue_mapping) != + sizeof(qdisc_skb_cb(skb)->bond_queue_mapping)); + skb->queue_mapping = qdisc_skb_cb(skb)->bond_queue_mapping; if (unlikely(netpoll_tx_running(slave_dev))) bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb); @@ -4171,7 +4172,7 @@ static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb) /* * Save the original txq to restore before passing to the driver */ - bond_queue_mapping(skb) = skb->queue_mapping; + qdisc_skb_cb(skb)->bond_queue_mapping = skb->queue_mapping; if (unlikely(txq >= dev->real_num_tx_queues)) { do { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 55ce96b53b09..9d7d54a00e63 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -220,13 +220,16 @@ struct tcf_proto { struct qdisc_skb_cb { unsigned int pkt_len; - unsigned char data[24]; + u16 bond_queue_mapping; + u16 _pad; + unsigned char data[20]; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; - BUILD_BUG_ON(sizeof(skb->cb) < sizeof(unsigned int) + sz); + + BUILD_BUG_ON(sizeof(skb->cb) < offsetof(struct qdisc_skb_cb, data) + sz); BUILD_BUG_ON(sizeof(qcb->data) < sz); } -- cgit v1.2.3 From 4149268e7816d719b0fde8e89aaa6db8c168fc43 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 18 May 2012 13:57:19 -0700 Subject: target: Add TFO->put_session() caller for HW fabric session shutdown This patch adds an optional target_core_fabric_ops->put_session() caller within the existing target_put_session() code path. This is required by tcm_qla2xxx code in order to invoke it's own fabric specific session shutdown handler using se_session->sess_kref. Signed-off-by: Joern Engel Cc: Roland Dreier Cc: Arun Easi Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 8 +++++++- include/target/target_core_fabric.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index b05fdc0c05d3..634d0f31a28c 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -315,7 +315,7 @@ void transport_register_session( } EXPORT_SYMBOL(transport_register_session); -static void target_release_session(struct kref *kref) +void target_release_session(struct kref *kref) { struct se_session *se_sess = container_of(kref, struct se_session, sess_kref); @@ -332,6 +332,12 @@ EXPORT_SYMBOL(target_get_session); void target_put_session(struct se_session *se_sess) { + struct se_portal_group *tpg = se_sess->se_tpg; + + if (tpg->se_tpg_tfo->put_session != NULL) { + tpg->se_tpg_tfo->put_session(se_sess); + return; + } kref_put(&se_sess->sess_kref, target_release_session); } EXPORT_SYMBOL(target_put_session); diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h index 116959933f46..c78a23333c4f 100644 --- a/include/target/target_core_fabric.h +++ b/include/target/target_core_fabric.h @@ -47,6 +47,7 @@ struct target_core_fabric_ops { */ int (*check_stop_free)(struct se_cmd *); void (*release_cmd)(struct se_cmd *); + void (*put_session)(struct se_session *); /* * Called with spin_lock_bh(struct se_portal_group->session_lock held. */ -- cgit v1.2.3 From 9b15b817f3d62409290fd56fe3cbb076a931bb0a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 15 Jun 2012 17:55:50 -0700 Subject: swap: fix shmem swapping when more than 8 areas Minchan Kim reports that when a system has many swap areas, and tmpfs swaps out to the ninth or more, shmem_getpage_gfp()'s attempts to read back the page cannot locate it, and the read fails with -ENOMEM. Whoops. Yes, I blindly followed read_swap_header()'s pte_to_swp_entry( swp_entry_to_pte()) technique for determining maximum usable swap offset, without stopping to realize that that actually depends upon the pte swap encoding shifting swap offset to the higher bits and truncating it there. Whereas our radix_tree swap encoding leaves offset in the lower bits: it's swap "type" (that is, index of swap area) that was truncated. Fix it by reducing the SWP_TYPE_SHIFT() in swapops.h, and removing the broken radix_to_swp_entry(swp_to_radix_entry()) from read_swap_header(). This does not reduce the usable size of a swap area any further, it leaves it as claimed when making the original commit: no change from 3.0 on x86_64, nor on i386 without PAE; but 3.0's 512GB is reduced to 128GB per swapfile on i386 with PAE. It's not a change I would have risked five years ago, but with x86_64 supported for ten years, I believe it's appropriate now. Hmm, and what if some architecture implements its swap pte with offset encoded below type? That would equally break the maximum usable swap offset check. Happily, they all follow the same tradition of encoding offset above type, but I'll prepare a check on that for next. Reported-and-Reviewed-and-Tested-by: Minchan Kim Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org [3.1, 3.2, 3.3, 3.4] Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 8 +++++--- mm/swapfile.c | 12 ++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 792d16d9cbc7..47ead515c811 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -9,13 +9,15 @@ * get good packing density in that tree, so the index should be dense in * the low-order bits. * - * We arrange the `type' and `offset' fields so that `type' is at the five + * We arrange the `type' and `offset' fields so that `type' is at the seven * high-order bits of the swp_entry_t and `offset' is right-aligned in the - * remaining bits. + * remaining bits. Although `type' itself needs only five bits, we allow for + * shmem/tmpfs to shift it all up a further two bits: see swp_to_radix_entry(). * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. */ -#define SWP_TYPE_SHIFT(e) (sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT) +#define SWP_TYPE_SHIFT(e) ((sizeof(e.val) * 8) - \ + (MAX_SWAPFILES_SHIFT + RADIX_TREE_EXCEPTIONAL_SHIFT)) #define SWP_OFFSET_MASK(e) ((1UL << SWP_TYPE_SHIFT(e)) - 1) /* diff --git a/mm/swapfile.c b/mm/swapfile.c index de5bc51c4a66..71373d03fcee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1916,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p, /* * Find out how many pages are allowed for a single swap - * device. There are three limiting factors: 1) the number + * device. There are two limiting factors: 1) the number * of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte as defined by the - * the different architectures, and 3) the number of free bits - * in an exceptional radix_tree entry. In order to find the + * different architectures. In order to find the * largest possible bit mask, a swap entry with swap type 0 * and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap * offset is extracted. This will mask all the bits from * the initial ~0UL mask that can't be encoded in either * the swp_entry_t or the architecture definition of a - * swap pte. Then the same is done for a radix_tree entry. + * swap pte. */ maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))); - maxpages = swp_offset(radix_to_swp_entry( - swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; - + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; if (maxpages > swap_header->info.last_page) { maxpages = swap_header->info.last_page + 1; /* p->max is an unsigned int: don't overflow it */ -- cgit v1.2.3