From a8d757ef076f0f95f13a918808824058de25b3eb Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 25 Aug 2011 15:58:03 +0200
Subject: perf events: Fix slow and broken cgroup context switch code

The current cgroup context switch code was incorrect leading
to bogus counts. Furthermore, as soon as there was an active
cgroup event on a CPU, the context switch cost on that CPU
would increase by a significant amount as demonstrated by a
simple ping/pong example:

 $ ./pong
 Both processes pinned to CPU1, running for 10s
 10684.51 ctxsw/s

Now start a cgroup perf stat:
 $ perf stat -e cycles,cycles -A -a -G test  -C 1 -- sleep 100

$ ./pong
 Both processes pinned to CPU1, running for 10s
 6674.61 ctxsw/s

That's a 37% penalty.

Note that pong is not even in the monitored cgroup.

The results shown by perf stat are bogus:
 $ perf stat -e cycles,cycles -A -a -G test  -C 1 -- sleep 100

 Performance counter stats for 'sleep 100':

 CPU1 <not counted> cycles   test
 CPU1 16,984,189,138 cycles  #    0.000 GHz

The second 'cycles' event should report a count @ CPU clock
(here 2.4GHz) as it is counting across all cgroups.

The patch below fixes the bogus accounting and bypasses any
cgroup switches in case the outgoing and incoming tasks are
in the same cgroup.

With this patch the same test now yields:
 $ ./pong
 Both processes pinned to CPU1, running for 10s
 10775.30 ctxsw/s

Start perf stat with cgroup:

 $ perf stat -e cycles,cycles -A -a -G test  -C 1 -- sleep 10

Run pong outside the cgroup:
 $ /pong
 Both processes pinned to CPU1, running for 10s
 10687.80 ctxsw/s

The penalty is now less than 2%.

And the results for perf stat are correct:

$ perf stat -e cycles,cycles -A -a -G test  -C 1 -- sleep 10

 Performance counter stats for 'sleep 10':

 CPU1 <not counted> cycles test #    0.000 GHz
 CPU1 23,933,981,448 cycles      #    0.000 GHz

Now perf stat reports the correct counts for
for the non cgroup event.

If we run pong inside the cgroup, then we also get the
correct counts:

$ perf stat -e cycles,cycles -A -a -G test  -C 1 -- sleep 10

 Performance counter stats for 'sleep 10':

 CPU1 22,297,726,205 cycles test #    0.000 GHz
 CPU1 23,933,981,448 cycles      #    0.000 GHz

      10.001457237 seconds time elapsed

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110825135803.GA4697@quad
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 245bafdafd5e..c816075c01ce 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -944,8 +944,10 @@ extern void perf_pmu_unregister(struct pmu *pmu);
 
 extern int perf_num_counters(void);
 extern const char *perf_pmu_name(void);
-extern void __perf_event_task_sched_in(struct task_struct *task);
-extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
+extern void __perf_event_task_sched_in(struct task_struct *prev,
+				       struct task_struct *task);
+extern void __perf_event_task_sched_out(struct task_struct *prev,
+					struct task_struct *next);
 extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
@@ -1059,17 +1061,20 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 
 extern struct jump_label_key perf_sched_events;
 
-static inline void perf_event_task_sched_in(struct task_struct *task)
+static inline void perf_event_task_sched_in(struct task_struct *prev,
+					    struct task_struct *task)
 {
 	if (static_branch(&perf_sched_events))
-		__perf_event_task_sched_in(task);
+		__perf_event_task_sched_in(prev, task);
 }
 
-static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
+static inline void perf_event_task_sched_out(struct task_struct *prev,
+					     struct task_struct *next)
 {
 	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);
 
-	__perf_event_task_sched_out(task, next);
+	if (static_branch(&perf_sched_events))
+		__perf_event_task_sched_out(prev, next);
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
@@ -1139,10 +1144,11 @@ extern void perf_event_disable(struct perf_event *event);
 extern void perf_event_task_tick(void);
 #else
 static inline void
-perf_event_task_sched_in(struct task_struct *task)			{ }
+perf_event_task_sched_in(struct task_struct *prev,
+			 struct task_struct *task)			{ }
 static inline void
-perf_event_task_sched_out(struct task_struct *task,
-			    struct task_struct *next)			{ }
+perf_event_task_sched_out(struct task_struct *prev,
+			  struct task_struct *next)			{ }
 static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
-- 
cgit v1.2.3


From 8efcc57dedfebc99c3cd39564e3fc47cd1a24b75 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 3 Aug 2011 18:04:29 +0900
Subject: mfd: Fix value of WM8994_CONFIGURE_GPIO

This needs to be an out of band value for the register and on this device
registers are 16 bit so we must shift left one to the 17th bit.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Cc: stable@kernel.org
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/linux/mfd/wm8994/pdata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/wm8994/pdata.h b/include/linux/mfd/wm8994/pdata.h
index d12f8d635a81..97cf4f27d647 100644
--- a/include/linux/mfd/wm8994/pdata.h
+++ b/include/linux/mfd/wm8994/pdata.h
@@ -26,7 +26,7 @@ struct wm8994_ldo_pdata {
 	struct regulator_init_data *init_data;
 };
 
-#define WM8994_CONFIGURE_GPIO 0x8000
+#define WM8994_CONFIGURE_GPIO 0x10000
 
 #define WM8994_DRC_REGS 5
 #define WM8994_EQ_REGS  20
-- 
cgit v1.2.3


From bff747c58cf97bf4fc8b499ee0f419b59d6b226d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Thu, 8 Sep 2011 10:16:47 -0700
Subject: regulator: fix kernel-doc warning in consumer.h

Fix kernel-doc warning about internal/private data by marking it
as "private:" so that kernel-doc will ignore it.

  Warning(include/linux/regulator/consumer.h:128): No description found for parameter 'ret'

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/regulator/consumer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 26f6ea4444e3..b47771aa5718 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -123,7 +123,7 @@ struct regulator_bulk_data {
 	const char *supply;
 	struct regulator *consumer;
 
-	/* Internal use */
+	/* private: Internal use */
 	int ret;
 };
 
-- 
cgit v1.2.3


From 185efc0f9a1f2d6ad6d4782c5d9e529f3290567f Mon Sep 17 00:00:00 2001
From: Johannes Weiner <jweiner@redhat.com>
Date: Wed, 14 Sep 2011 16:21:58 -0700
Subject: memcg: Revert "memcg: add memory.vmscan_stat"

Revert the post-3.0 commit 82f9d486e59f5 ("memcg: add
memory.vmscan_stat").

The implementation of per-memcg reclaim statistics violates how memcg
hierarchies usually behave: hierarchically.

The reclaim statistics are accounted to child memcgs and the parent
hitting the limit, but not to hierarchy levels in between.  Usually,
hierarchical statistics are perfectly recursive, with each level
representing the sum of itself and all its children.

Since this exports statistics to userspace, this may lead to confusion
and problems with changing things after the release, so revert it now,
we can try again later.

Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ying Han <yinghan@google.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/cgroups/memory.txt |  85 +------------------
 include/linux/memcontrol.h       |  19 -----
 include/linux/swap.h             |   6 ++
 mm/memcontrol.c                  | 172 ++-------------------------------------
 mm/vmscan.c                      |  39 ++-------
 5 files changed, 18 insertions(+), 303 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 6f3c598971fc..06eb6d957c83 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -380,7 +380,7 @@ will be charged as a new owner of it.
 
 5.2 stat file
 
-5.2.1 memory.stat file includes following statistics
+memory.stat file includes following statistics
 
 # per-memory cgroup local status
 cache		- # of bytes of page cache memory.
@@ -438,89 +438,6 @@ Note:
 	 file_mapped is accounted only when the memory cgroup is owner of page
 	 cache.)
 
-5.2.2 memory.vmscan_stat
-
-memory.vmscan_stat includes statistics information for memory scanning and
-freeing, reclaiming. The statistics shows memory scanning information since
-memory cgroup creation and can be reset to 0 by writing 0 as
-
- #echo 0 > ../memory.vmscan_stat
-
-This file contains following statistics.
-
-[param]_[file_or_anon]_pages_by_[reason]_[under_heararchy]
-[param]_elapsed_ns_by_[reason]_[under_hierarchy]
-
-For example,
-
-  scanned_file_pages_by_limit indicates the number of scanned
-  file pages at vmscan.
-
-Now, 3 parameters are supported
-
-  scanned - the number of pages scanned by vmscan
-  rotated - the number of pages activated at vmscan
-  freed   - the number of pages freed by vmscan
-
-If "rotated" is high against scanned/freed, the memcg seems busy.
-
-Now, 2 reason are supported
-
-  limit - the memory cgroup's limit
-  system - global memory pressure + softlimit
-           (global memory pressure not under softlimit is not handled now)
-
-When under_hierarchy is added in the tail, the number indicates the
-total memcg scan of its children and itself.
-
-elapsed_ns is a elapsed time in nanosecond. This may include sleep time
-and not indicates CPU usage. So, please take this as just showing
-latency.
-
-Here is an example.
-
-# cat /cgroup/memory/A/memory.vmscan_stat
-scanned_pages_by_limit 9471864
-scanned_anon_pages_by_limit 6640629
-scanned_file_pages_by_limit 2831235
-rotated_pages_by_limit 4243974
-rotated_anon_pages_by_limit 3971968
-rotated_file_pages_by_limit 272006
-freed_pages_by_limit 2318492
-freed_anon_pages_by_limit 962052
-freed_file_pages_by_limit 1356440
-elapsed_ns_by_limit 351386416101
-scanned_pages_by_system 0
-scanned_anon_pages_by_system 0
-scanned_file_pages_by_system 0
-rotated_pages_by_system 0
-rotated_anon_pages_by_system 0
-rotated_file_pages_by_system 0
-freed_pages_by_system 0
-freed_anon_pages_by_system 0
-freed_file_pages_by_system 0
-elapsed_ns_by_system 0
-scanned_pages_by_limit_under_hierarchy 9471864
-scanned_anon_pages_by_limit_under_hierarchy 6640629
-scanned_file_pages_by_limit_under_hierarchy 2831235
-rotated_pages_by_limit_under_hierarchy 4243974
-rotated_anon_pages_by_limit_under_hierarchy 3971968
-rotated_file_pages_by_limit_under_hierarchy 272006
-freed_pages_by_limit_under_hierarchy 2318492
-freed_anon_pages_by_limit_under_hierarchy 962052
-freed_file_pages_by_limit_under_hierarchy 1356440
-elapsed_ns_by_limit_under_hierarchy 351386416101
-scanned_pages_by_system_under_hierarchy 0
-scanned_anon_pages_by_system_under_hierarchy 0
-scanned_file_pages_by_system_under_hierarchy 0
-rotated_pages_by_system_under_hierarchy 0
-rotated_anon_pages_by_system_under_hierarchy 0
-rotated_file_pages_by_system_under_hierarchy 0
-freed_pages_by_system_under_hierarchy 0
-freed_anon_pages_by_system_under_hierarchy 0
-freed_file_pages_by_system_under_hierarchy 0
-elapsed_ns_by_system_under_hierarchy 0
-
 5.3 swappiness
 
 Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3b535db00a94..343bd7661f2a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -39,16 +39,6 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					struct mem_cgroup *mem_cont,
 					int active, int file);
 
-struct memcg_scanrecord {
-	struct mem_cgroup *mem; /* scanend memory cgroup */
-	struct mem_cgroup *root; /* scan target hierarchy root */
-	int context;		/* scanning context (see memcontrol.c) */
-	unsigned long nr_scanned[2]; /* the number of scanned pages */
-	unsigned long nr_rotated[2]; /* the number of rotated pages */
-	unsigned long nr_freed[2]; /* the number of freed pages */
-	unsigned long elapsed; /* nsec of time elapsed while scanning */
-};
-
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 /*
  * All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -127,15 +117,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 					struct task_struct *p);
 
-extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
-						  gfp_t gfp_mask, bool noswap,
-						  struct memcg_scanrecord *rec);
-extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
-						gfp_t gfp_mask, bool noswap,
-						struct zone *zone,
-						struct memcg_scanrecord *rec,
-						unsigned long *nr_scanned);
-
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 extern int do_swap_account;
 #endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 14d62490922e..c71f84bb62ec 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -252,6 +252,12 @@ static inline void lru_cache_add_file(struct page *page)
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, int mode, int file);
+extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
+						  gfp_t gfp_mask, bool noswap);
+extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+						gfp_t gfp_mask, bool noswap,
+						struct zone *zone,
+						unsigned long *nr_scanned);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ebd1e86bef1c..3508777837c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -204,50 +204,6 @@ struct mem_cgroup_eventfd_list {
 static void mem_cgroup_threshold(struct mem_cgroup *mem);
 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
 
-enum {
-	SCAN_BY_LIMIT,
-	SCAN_BY_SYSTEM,
-	NR_SCAN_CONTEXT,
-	SCAN_BY_SHRINK,	/* not recorded now */
-};
-
-enum {
-	SCAN,
-	SCAN_ANON,
-	SCAN_FILE,
-	ROTATE,
-	ROTATE_ANON,
-	ROTATE_FILE,
-	FREED,
-	FREED_ANON,
-	FREED_FILE,
-	ELAPSED,
-	NR_SCANSTATS,
-};
-
-struct scanstat {
-	spinlock_t	lock;
-	unsigned long	stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
-	unsigned long	rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
-};
-
-const char *scanstat_string[NR_SCANSTATS] = {
-	"scanned_pages",
-	"scanned_anon_pages",
-	"scanned_file_pages",
-	"rotated_pages",
-	"rotated_anon_pages",
-	"rotated_file_pages",
-	"freed_pages",
-	"freed_anon_pages",
-	"freed_file_pages",
-	"elapsed_ns",
-};
-#define SCANSTAT_WORD_LIMIT	"_by_limit"
-#define SCANSTAT_WORD_SYSTEM	"_by_system"
-#define SCANSTAT_WORD_HIERARCHY	"_under_hierarchy"
-
-
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -313,8 +269,7 @@ struct mem_cgroup {
 
 	/* For oom notifier event fd */
 	struct list_head oom_notify;
-	/* For recording LRU-scan statistics */
-	struct scanstat scanstat;
+
 	/*
 	 * Should we move charges of a task when a task is moved into this
 	 * mem_cgroup ? And what type of charges should we move ?
@@ -1678,44 +1633,6 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
 }
 #endif
 
-static void __mem_cgroup_record_scanstat(unsigned long *stats,
-			   struct memcg_scanrecord *rec)
-{
-
-	stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
-	stats[SCAN_ANON] += rec->nr_scanned[0];
-	stats[SCAN_FILE] += rec->nr_scanned[1];
-
-	stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
-	stats[ROTATE_ANON] += rec->nr_rotated[0];
-	stats[ROTATE_FILE] += rec->nr_rotated[1];
-
-	stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
-	stats[FREED_ANON] += rec->nr_freed[0];
-	stats[FREED_FILE] += rec->nr_freed[1];
-
-	stats[ELAPSED] += rec->elapsed;
-}
-
-static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
-{
-	struct mem_cgroup *mem;
-	int context = rec->context;
-
-	if (context >= NR_SCAN_CONTEXT)
-		return;
-
-	mem = rec->mem;
-	spin_lock(&mem->scanstat.lock);
-	__mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
-	spin_unlock(&mem->scanstat.lock);
-
-	mem = rec->root;
-	spin_lock(&mem->scanstat.lock);
-	__mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
-	spin_unlock(&mem->scanstat.lock);
-}
-
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1740,9 +1657,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
 	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
-	struct memcg_scanrecord rec;
 	unsigned long excess;
-	unsigned long scanned;
+	unsigned long nr_scanned;
 
 	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
@@ -1750,15 +1666,6 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	if (!check_soft && !shrink && root_mem->memsw_is_minimum)
 		noswap = true;
 
-	if (shrink)
-		rec.context = SCAN_BY_SHRINK;
-	else if (check_soft)
-		rec.context = SCAN_BY_SYSTEM;
-	else
-		rec.context = SCAN_BY_LIMIT;
-
-	rec.root = root_mem;
-
 	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
@@ -1799,23 +1706,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 			css_put(&victim->css);
 			continue;
 		}
-		rec.mem = victim;
-		rec.nr_scanned[0] = 0;
-		rec.nr_scanned[1] = 0;
-		rec.nr_rotated[0] = 0;
-		rec.nr_rotated[1] = 0;
-		rec.nr_freed[0] = 0;
-		rec.nr_freed[1] = 0;
-		rec.elapsed = 0;
 		/* we use swappiness of local cgroup */
 		if (check_soft) {
 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-				noswap, zone, &rec, &scanned);
-			*total_scanned += scanned;
+				noswap, zone, &nr_scanned);
+			*total_scanned += nr_scanned;
 		} else
 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
-						noswap, &rec);
-		mem_cgroup_record_scanstat(&rec);
+						noswap);
 		css_put(&victim->css);
 		/*
 		 * At shrinking usage, we can't check we should stop here or
@@ -3854,18 +3752,14 @@ try_to_free:
 	/* try to free all pages in this cgroup */
 	shrink = 1;
 	while (nr_retries && mem->res.usage > 0) {
-		struct memcg_scanrecord rec;
 		int progress;
 
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			goto out;
 		}
-		rec.context = SCAN_BY_SHRINK;
-		rec.mem = mem;
-		rec.root = mem;
 		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
-						false, &rec);
+						false);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -4709,54 +4603,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
 }
 #endif /* CONFIG_NUMA */
 
-static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
-				struct cftype *cft,
-				struct cgroup_map_cb *cb)
-{
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
-	char string[64];
-	int i;
-
-	for (i = 0; i < NR_SCANSTATS; i++) {
-		strcpy(string, scanstat_string[i]);
-		strcat(string, SCANSTAT_WORD_LIMIT);
-		cb->fill(cb, string,  mem->scanstat.stats[SCAN_BY_LIMIT][i]);
-	}
-
-	for (i = 0; i < NR_SCANSTATS; i++) {
-		strcpy(string, scanstat_string[i]);
-		strcat(string, SCANSTAT_WORD_SYSTEM);
-		cb->fill(cb, string,  mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
-	}
-
-	for (i = 0; i < NR_SCANSTATS; i++) {
-		strcpy(string, scanstat_string[i]);
-		strcat(string, SCANSTAT_WORD_LIMIT);
-		strcat(string, SCANSTAT_WORD_HIERARCHY);
-		cb->fill(cb, string,  mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
-	}
-	for (i = 0; i < NR_SCANSTATS; i++) {
-		strcpy(string, scanstat_string[i]);
-		strcat(string, SCANSTAT_WORD_SYSTEM);
-		strcat(string, SCANSTAT_WORD_HIERARCHY);
-		cb->fill(cb, string,  mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
-	}
-	return 0;
-}
-
-static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
-				unsigned int event)
-{
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
-
-	spin_lock(&mem->scanstat.lock);
-	memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
-	memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
-	spin_unlock(&mem->scanstat.lock);
-	return 0;
-}
-
-
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -4827,11 +4673,6 @@ static struct cftype mem_cgroup_files[] = {
 		.mode = S_IRUGO,
 	},
 #endif
-	{
-		.name = "vmscan_stat",
-		.read_map = mem_cgroup_vmscan_stat_read,
-		.trigger = mem_cgroup_reset_vmscan_stat,
-	},
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -5095,7 +4936,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	atomic_set(&mem->refcnt, 1);
 	mem->move_charge_at_immigrate = 0;
 	mutex_init(&mem->thresholds_lock);
-	spin_lock_init(&mem->scanstat.lock);
 	return &mem->css;
 free_out:
 	__mem_cgroup_free(mem);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e49bcb6d4948..b55699cd9067 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -105,7 +105,6 @@ struct scan_control {
 
 	/* Which cgroup do we reclaim from */
 	struct mem_cgroup *mem_cgroup;
-	struct memcg_scanrecord *memcg_record;
 
 	/*
 	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -1349,8 +1348,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
 			int file = is_file_lru(lru);
 			int numpages = hpage_nr_pages(page);
 			reclaim_stat->recent_rotated[file] += numpages;
-			if (!scanning_global_lru(sc))
-				sc->memcg_record->nr_rotated[file] += numpages;
 		}
 		if (!pagevec_add(&pvec, page)) {
 			spin_unlock_irq(&zone->lru_lock);
@@ -1394,10 +1391,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
 
 	reclaim_stat->recent_scanned[0] += *nr_anon;
 	reclaim_stat->recent_scanned[1] += *nr_file;
-	if (!scanning_global_lru(sc)) {
-		sc->memcg_record->nr_scanned[0] += *nr_anon;
-		sc->memcg_record->nr_scanned[1] += *nr_file;
-	}
 }
 
 /*
@@ -1511,9 +1504,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 		nr_reclaimed += shrink_page_list(&page_list, zone, sc);
 	}
 
-	if (!scanning_global_lru(sc))
-		sc->memcg_record->nr_freed[file] += nr_reclaimed;
-
 	local_irq_disable();
 	if (current_is_kswapd())
 		__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1613,8 +1603,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	}
 
 	reclaim_stat->recent_scanned[file] += nr_taken;
-	if (!scanning_global_lru(sc))
-		sc->memcg_record->nr_scanned[file] += nr_taken;
 
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	if (file)
@@ -1666,8 +1654,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	 * get_scan_ratio.
 	 */
 	reclaim_stat->recent_rotated[file] += nr_rotated;
-	if (!scanning_global_lru(sc))
-		sc->memcg_record->nr_rotated[file] += nr_rotated;
 
 	move_active_pages_to_lru(zone, &l_active,
 						LRU_ACTIVE + file * LRU_FILE);
@@ -2265,10 +2251,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
-					gfp_t gfp_mask, bool noswap,
-					struct zone *zone,
-					struct memcg_scanrecord *rec,
-					unsigned long *scanned)
+						gfp_t gfp_mask, bool noswap,
+						struct zone *zone,
+						unsigned long *nr_scanned)
 {
 	struct scan_control sc = {
 		.nr_scanned = 0,
@@ -2278,9 +2263,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 		.may_swap = !noswap,
 		.order = 0,
 		.mem_cgroup = mem,
-		.memcg_record = rec,
 	};
-	ktime_t start, end;
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2289,7 +2272,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						      sc.may_writepage,
 						      sc.gfp_mask);
 
-	start = ktime_get();
 	/*
 	 * NOTE: Although we can get the priority field, using it
 	 * here is not a good idea, since it limits the pages we can scan.
@@ -2298,25 +2280,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 	 * the priority and make it zero.
 	 */
 	shrink_zone(0, zone, &sc);
-	end = ktime_get();
-
-	if (rec)
-		rec->elapsed += ktime_to_ns(ktime_sub(end, start));
-	*scanned = sc.nr_scanned;
 
 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
+	*nr_scanned = sc.nr_scanned;
 	return sc.nr_reclaimed;
 }
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   gfp_t gfp_mask,
-					   bool noswap,
-					   struct memcg_scanrecord *rec)
+					   bool noswap)
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
-	ktime_t start, end;
 	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
@@ -2325,7 +2301,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.order = 0,
 		.mem_cgroup = mem_cont,
-		.memcg_record = rec,
 		.nodemask = NULL, /* we don't care the placement */
 		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2334,7 +2309,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.gfp_mask = sc.gfp_mask,
 	};
 
-	start = ktime_get();
 	/*
 	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
 	 * take care of from where we get pages. So the node where we start the
@@ -2349,9 +2323,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					    sc.gfp_mask);
 
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
-	end = ktime_get();
-	if (rec)
-		rec->elapsed += ktime_to_ns(ktime_sub(end, start));
 
 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
-- 
cgit v1.2.3


From 4f5b04800a224aadb6cffcbbc3d3fa26e2367c7f Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Wed, 14 Sep 2011 16:22:29 -0700
Subject: drivers/gpio/gpio-generic.c: fix build errors

Building a kernel with hotplug disabled results in a link failure:

  `bgpio_remove' referenced in section `___ksymtab_gpl+bgpio_remove' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o

This is because of bgpio_remove() is exported.  It is illegal to export
symbols which are discarded either at link time or as part of an
init/exit section.

Fix this by dropping the __devexit attributation from bgpio_remove().
Also drop the __devinit attributation from bgpio_init().

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/gpio-generic.c     | 15 +++++----------
 include/linux/basic_mmio_gpio.h | 15 +++++----------
 2 files changed, 10 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-generic.c b/drivers/gpio/gpio-generic.c
index 231714def4d2..4e24436b0f82 100644
--- a/drivers/gpio/gpio-generic.c
+++ b/drivers/gpio/gpio-generic.c
@@ -351,7 +351,7 @@ static int bgpio_setup_direction(struct bgpio_chip *bgc,
 	return 0;
 }
 
-int __devexit bgpio_remove(struct bgpio_chip *bgc)
+int bgpio_remove(struct bgpio_chip *bgc)
 {
 	int err = gpiochip_remove(&bgc->gc);
 
@@ -361,15 +361,10 @@ int __devexit bgpio_remove(struct bgpio_chip *bgc)
 }
 EXPORT_SYMBOL_GPL(bgpio_remove);
 
-int __devinit bgpio_init(struct bgpio_chip *bgc,
-			 struct device *dev,
-			 unsigned long sz,
-			 void __iomem *dat,
-			 void __iomem *set,
-			 void __iomem *clr,
-			 void __iomem *dirout,
-			 void __iomem *dirin,
-			 bool big_endian)
+int bgpio_init(struct bgpio_chip *bgc, struct device *dev,
+	       unsigned long sz, void __iomem *dat, void __iomem *set,
+	       void __iomem *clr, void __iomem *dirout, void __iomem *dirin,
+	       bool big_endian)
 {
 	int ret;
 
diff --git a/include/linux/basic_mmio_gpio.h b/include/linux/basic_mmio_gpio.h
index 98999cf107ce..feb912196745 100644
--- a/include/linux/basic_mmio_gpio.h
+++ b/include/linux/basic_mmio_gpio.h
@@ -63,15 +63,10 @@ static inline struct bgpio_chip *to_bgpio_chip(struct gpio_chip *gc)
 	return container_of(gc, struct bgpio_chip, gc);
 }
 
-int __devexit bgpio_remove(struct bgpio_chip *bgc);
-int __devinit bgpio_init(struct bgpio_chip *bgc,
-			 struct device *dev,
-			 unsigned long sz,
-			 void __iomem *dat,
-			 void __iomem *set,
-			 void __iomem *clr,
-			 void __iomem *dirout,
-			 void __iomem *dirin,
-			 bool big_endian);
+int bgpio_remove(struct bgpio_chip *bgc);
+int bgpio_init(struct bgpio_chip *bgc, struct device *dev,
+	       unsigned long sz, void __iomem *dat, void __iomem *set,
+	       void __iomem *clr, void __iomem *dirout, void __iomem *dirin,
+	       bool big_endian);
 
 #endif /* __BASIC_MMIO_GPIO_H */
-- 
cgit v1.2.3


From 946cedccbd7387488d2cee5da92cdfeb28d2e670 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 30 Aug 2011 03:21:44 +0000
Subject: tcp: Change possible SYN flooding messages

"Possible SYN flooding on port xxxx " messages can fill logs on servers.

Change logic to log the message only once per listener, and add two new
SNMP counters to track :

TCPReqQFullDoCookies : number of times a SYNCOOKIE was replied to client

TCPReqQFullDrop : number of times a SYN request was dropped because
syncookies were not enabled.

Based on a prior patch from Tom Herbert, and suggestions from David.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/snmp.h       |  2 ++
 include/net/request_sock.h |  3 ++-
 include/net/tcp.h          |  3 +++
 net/ipv4/proc.c            |  2 ++
 net/ipv4/tcp_ipv4.c        | 49 ++++++++++++++++++++++++++--------------------
 net/ipv6/tcp_ipv6.c        | 31 +++--------------------------
 6 files changed, 40 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 12b2b18e50c1..e16557a357e5 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -231,6 +231,8 @@ enum
 	LINUX_MIB_TCPDEFERACCEPTDROP,
 	LINUX_MIB_IPRPFILTER, /* IP Reverse Path Filter (rp_filter) */
 	LINUX_MIB_TCPTIMEWAITOVERFLOW,		/* TCPTimeWaitOverflow */
+	LINUX_MIB_TCPREQQFULLDOCOOKIES,		/* TCPReqQFullDoCookies */
+	LINUX_MIB_TCPREQQFULLDROP,		/* TCPReqQFullDrop */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 99e6e19b57c2..4c0766e201e3 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -96,7 +96,8 @@ extern int sysctl_max_syn_backlog;
  */
 struct listen_sock {
 	u8			max_qlen_log;
-	/* 3 bytes hole, try to use */
+	u8			synflood_warned;
+	/* 2 bytes hole, try to use */
 	int			qlen;
 	int			qlen_young;
 	int			clock_hand;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 149a415d1e0a..e9b48b094683 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -460,6 +460,9 @@ extern int tcp_write_wakeup(struct sock *);
 extern void tcp_send_fin(struct sock *sk);
 extern void tcp_send_active_reset(struct sock *sk, gfp_t priority);
 extern int tcp_send_synack(struct sock *);
+extern int tcp_syn_flood_action(struct sock *sk,
+				const struct sk_buff *skb,
+				const char *proto);
 extern void tcp_push_one(struct sock *, unsigned int mss_now);
 extern void tcp_send_ack(struct sock *sk);
 extern void tcp_send_delayed_ack(struct sock *sk);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b14ec7d03b6e..4bfad5da94f4 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -254,6 +254,8 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
 	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
 	SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
+	SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
+	SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1c12b8ec849d..c34f01513945 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -808,20 +808,38 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
 	kfree(inet_rsk(req)->opt);
 }
 
-static void syn_flood_warning(const struct sk_buff *skb)
+/*
+ * Return 1 if a syncookie should be sent
+ */
+int tcp_syn_flood_action(struct sock *sk,
+			 const struct sk_buff *skb,
+			 const char *proto)
 {
-	const char *msg;
+	const char *msg = "Dropping request";
+	int want_cookie = 0;
+	struct listen_sock *lopt;
+
+
 
 #ifdef CONFIG_SYN_COOKIES
-	if (sysctl_tcp_syncookies)
+	if (sysctl_tcp_syncookies) {
 		msg = "Sending cookies";
-	else
+		want_cookie = 1;
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
+	} else
 #endif
-		msg = "Dropping request";
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 
-	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
-				ntohs(tcp_hdr(skb)->dest), msg);
+	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
+	if (!lopt->synflood_warned) {
+		lopt->synflood_warned = 1;
+		pr_info("%s: Possible SYN flooding on port %d. %s. "
+			" Check SNMP counters.\n",
+			proto, ntohs(tcp_hdr(skb)->dest), msg);
+	}
+	return want_cookie;
 }
+EXPORT_SYMBOL(tcp_syn_flood_action);
 
 /*
  * Save and compile IPv4 options into the request_sock if needed.
@@ -1235,11 +1253,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	__be32 saddr = ip_hdr(skb)->saddr;
 	__be32 daddr = ip_hdr(skb)->daddr;
 	__u32 isn = TCP_SKB_CB(skb)->when;
-#ifdef CONFIG_SYN_COOKIES
 	int want_cookie = 0;
-#else
-#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
-#endif
 
 	/* Never answer to SYNs send to broadcast or multicast */
 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1250,14 +1264,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	 * evidently real one.
 	 */
 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
-		if (net_ratelimit())
-			syn_flood_warning(skb);
-#ifdef CONFIG_SYN_COOKIES
-		if (sysctl_tcp_syncookies) {
-			want_cookie = 1;
-		} else
-#endif
-		goto drop;
+		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
+		if (!want_cookie)
+			goto drop;
 	}
 
 	/* Accept backlog is full. If we have already queued enough
@@ -1303,9 +1312,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		while (l-- > 0)
 			*c++ ^= *hash_location++;
 
-#ifdef CONFIG_SYN_COOKIES
 		want_cookie = 0;	/* not our kind of cookie */
-#endif
 		tmp_ext.cookie_out_never = 0; /* false */
 		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
 	} else if (!tp->rx_opt.cookie_in_always) {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d1fb63f4aeb7..3c9fa618b69d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -531,20 +531,6 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req,
 	return tcp_v6_send_synack(sk, req, rvp);
 }
 
-static inline void syn_flood_warning(struct sk_buff *skb)
-{
-#ifdef CONFIG_SYN_COOKIES
-	if (sysctl_tcp_syncookies)
-		printk(KERN_INFO
-		       "TCPv6: Possible SYN flooding on port %d. "
-		       "Sending cookies.\n", ntohs(tcp_hdr(skb)->dest));
-	else
-#endif
-		printk(KERN_INFO
-		       "TCPv6: Possible SYN flooding on port %d. "
-		       "Dropping request.\n", ntohs(tcp_hdr(skb)->dest));
-}
-
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
 {
 	kfree_skb(inet6_rsk(req)->pktopts);
@@ -1179,11 +1165,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 isn = TCP_SKB_CB(skb)->when;
 	struct dst_entry *dst = NULL;
-#ifdef CONFIG_SYN_COOKIES
 	int want_cookie = 0;
-#else
-#define want_cookie 0
-#endif
 
 	if (skb->protocol == htons(ETH_P_IP))
 		return tcp_v4_conn_request(sk, skb);
@@ -1192,14 +1174,9 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		goto drop;
 
 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
-		if (net_ratelimit())
-			syn_flood_warning(skb);
-#ifdef CONFIG_SYN_COOKIES
-		if (sysctl_tcp_syncookies)
-			want_cookie = 1;
-		else
-#endif
-		goto drop;
+		want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6");
+		if (!want_cookie)
+			goto drop;
 	}
 
 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
@@ -1249,9 +1226,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		while (l-- > 0)
 			*c++ ^= *hash_location++;
 
-#ifdef CONFIG_SYN_COOKIES
 		want_cookie = 0;	/* not our kind of cookie */
-#endif
 		tmp_ext.cookie_out_never = 0; /* false */
 		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
 	} else if (!tp->rx_opt.cookie_in_always) {
-- 
cgit v1.2.3


From 48c830120f2a20b44220aa26feda9ed15f49eaab Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 31 Aug 2011 08:03:29 +0000
Subject: net: copy userspace buffers on device forwarding

dev_forward_skb loops an skb back into host networking
stack which might hang on the memory indefinitely.
In particular, this can happen in macvtap in bridged mode.
Copy the userspace fragments to avoid blocking the
sender in that case.

As this patch makes skb_copy_ubufs extern now,
I also added some documentation and made it clear
the SKBTX_DEV_ZEROCOPY flag automatically instead
of doing it in all callers. This can be made into a separate
patch if people feel it's worth it.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  1 +
 net/core/dev.c         |  8 ++++++++
 net/core/skbuff.c      | 22 +++++++++++++++++-----
 3 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7b996ed86d5b..8bd383caa363 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -524,6 +524,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 extern bool skb_recycle_check(struct sk_buff *skb, int skb_size);
 
 extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
+extern int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
 extern struct sk_buff *skb_clone(struct sk_buff *skb,
 				 gfp_t priority);
 extern struct sk_buff *skb_copy(const struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 17d67b579beb..b10ff0a71855 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1515,6 +1515,14 @@ static inline bool is_skb_forwardable(struct net_device *dev,
  */
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 {
+	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
+			atomic_long_inc(&dev->rx_dropped);
+			kfree_skb(skb);
+			return NET_RX_DROP;
+		}
+	}
+
 	skb_orphan(skb);
 	nf_reset(skb);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 27002dffe7ed..387703f56fce 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -611,8 +611,21 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 }
 EXPORT_SYMBOL_GPL(skb_morph);
 
-/* skb frags copy userspace buffers to kernel */
-static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
+/*	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
+ *	@skb: the skb to modify
+ *	@gfp_mask: allocation priority
+ *
+ *	This must be called on SKBTX_DEV_ZEROCOPY skb.
+ *	It will copy all frags into kernel and drop the reference
+ *	to userspace pages.
+ *
+ *	If this function is called from an interrupt gfp_mask() must be
+ *	%GFP_ATOMIC.
+ *
+ *	Returns 0 on success or a negative error code on failure
+ *	to allocate kernel memory to copy to.
+ */
+int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	int i;
 	int num_frags = skb_shinfo(skb)->nr_frags;
@@ -652,6 +665,8 @@ static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 		skb_shinfo(skb)->frags[i - 1].page = head;
 		head = (struct page *)head->private;
 	}
+
+	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 	return 0;
 }
 
@@ -677,7 +692,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 		if (skb_copy_ubufs(skb, gfp_mask))
 			return NULL;
-		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 	}
 
 	n = skb + 1;
@@ -803,7 +817,6 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 				n = NULL;
 				goto out;
 			}
-			skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 		}
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
@@ -896,7 +909,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 			if (skb_copy_ubufs(skb, gfp_mask))
 				goto nofrags;
-			skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 		}
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 			get_page(skb_shinfo(skb)->frags[i].page);
-- 
cgit v1.2.3