From b817525a4a80c04e4ca44192d97a1ffa9f2be572 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 2 Oct 2015 14:47:05 -0400
Subject: writeback: bdi_writeback iteration must not skip dying ones

bdi_for_each_wb() is used in several places to wake up or issue
writeback work items to all wb's (bdi_writeback's) on a given bdi.
The iteration is performed by walking bdi->cgwb_tree; however, the
tree only indexes wb's which are currently active.

For example, when a memcg gets associated with a different blkcg, the
old wb is removed from the tree so that the new one can be indexed.
The old wb starts dying from then on but will linger till all its
inodes are drained.  As these dying wb's may still host dirty inodes,
writeback operations which affect all wb's must include them.
bdi_for_each_wb() skipping dying wb's led to sync(2) missing and
failing to sync the inodes belonging to those wb's.

This patch adds a RCU protected @bdi->wb_list which lists all wb's
beloinging to that bdi.  wb's are added on creation and removed on
release rather than on the start of destruction.  bdi_for_each_wb()
usages are replaced with list_for_each[_continue]_rcu() iterations
over @bdi->wb_list and bdi_for_each_wb() and its helpers are removed.

v2: Updated as per Jan.  last_wb ref leak in bdi_split_work_to_wbs()
    fixed and unnecessary list head severing in cgwb_bdi_destroy()
    removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Artem Bityutskiy <dedekind1@gmail.com>
Fixes: ebe41ab0c79d ("writeback: implement bdi_for_each_wb()")
Link: http://lkml.kernel.org/g/1443012552.19983.209.camel@gmail.com
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev-defs.h |  3 ++
 include/linux/backing-dev.h      | 63 ----------------------------------------
 2 files changed, 3 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index a23209b43842..1b4d69f68c33 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -116,6 +116,8 @@ struct bdi_writeback {
 	struct list_head work_list;
 	struct delayed_work dwork;	/* work item used for writeback */
 
+	struct list_head bdi_node;	/* anchored at bdi->wb_list */
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct percpu_ref refcnt;	/* used only for !root wb's */
 	struct fprop_local_percpu memcg_completions;
@@ -150,6 +152,7 @@ struct backing_dev_info {
 	atomic_long_t tot_write_bandwidth;
 
 	struct bdi_writeback wb;  /* the root writeback info for this bdi */
+	struct list_head wb_list; /* list of all wbs */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
 	struct rb_root cgwb_congested_tree; /* their congested states */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index d5eb4ad1c534..78677e5a65bf 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -408,61 +408,6 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
 	rcu_read_unlock();
 }
 
-struct wb_iter {
-	int			start_memcg_id;
-	struct radix_tree_iter	tree_iter;
-	void			**slot;
-};
-
-static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
-						   struct backing_dev_info *bdi)
-{
-	struct radix_tree_iter *titer = &iter->tree_iter;
-
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	if (iter->start_memcg_id >= 0) {
-		iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
-		iter->start_memcg_id = -1;
-	} else {
-		iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
-	}
-
-	if (!iter->slot)
-		iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
-	if (iter->slot)
-		return *iter->slot;
-	return NULL;
-}
-
-static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
-						   struct backing_dev_info *bdi,
-						   int start_memcg_id)
-{
-	iter->start_memcg_id = start_memcg_id;
-
-	if (start_memcg_id)
-		return __wb_iter_next(iter, bdi);
-	else
-		return &bdi->wb;
-}
-
-/**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
- * @wb_cur: cursor struct bdi_writeback pointer
- * @bdi: bdi to walk wb's of
- * @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_memcg_id: memcg ID to start iteration from
- *
- * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
- * to be used as temp storage during iteration.  rcu_read_lock() must be
- * held throughout iteration.
- */
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)		\
-	for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);	\
-	     (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
-
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static inline bool inode_cgwb_enabled(struct inode *inode)
@@ -522,14 +467,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
 {
 }
 
-struct wb_iter {
-	int		next_id;
-};
-
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)		\
-	for ((iter)->next_id = (start_blkcg_id);			\
-	     ({	(wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
-
 static inline int inode_congested(struct inode *inode, int cong_bits)
 {
 	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
-- 
cgit v1.2.3


From c5edf9cdc4c483b9a94c03fc0b9f769bd090bf3e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Sep 2015 13:04:26 -0400
Subject: writeback: fix incorrect calculation of available memory for memcg
 domains

For memcg domains, the amount of available memory was calculated as

 min(the amount currently in use + headroom according to memcg,
     total clean memory)

This isn't quite correct as what should be capped by the amount of
clean memory is the headroom, not the sum of memory in use and
headroom.  For example, if a memcg domain has a significant amount of
dirty memory, the above can lead to a value which is lower than the
current amount in use which doesn't make much sense.  In most
circumstances, the above leads to a number which is somewhat but not
drastically lower.

As the amount of memory which can be readily allocated to the memcg
domain is capped by the amount of system-wide clean memory which is
not already assigned to the memcg itself, the number we want is

 the amount currently in use +
 min(headroom according to memcg, clean memory elsewhere in the system)

This patch updates mem_cgroup_wb_stats() to return the number of
filepages and headroom instead of the calculated available pages.
mdtc_cap_avail() is renamed to mdtc_calc_avail() and performs the
above calculation from file, headroom, dirty and globally clean pages.

v2: Dummy mem_cgroup_wb_stats() implementation wasn't updated leading
    to build failure when !CGROUP_WRITEBACK.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/memcontrol.h |  8 +++++---
 mm/memcontrol.c            | 35 +++++++++++++++++------------------
 mm/page-writeback.c        | 29 ++++++++++++++++++-----------
 3 files changed, 40 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6452ff4c463f..3e3318ddfc0e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -676,8 +676,9 @@ enum {
 
 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
-			 unsigned long *pdirty, unsigned long *pwriteback);
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+			 unsigned long *pheadroom, unsigned long *pdirty,
+			 unsigned long *pwriteback);
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
@@ -687,7 +688,8 @@ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 }
 
 static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
-				       unsigned long *pavail,
+				       unsigned long *pfilepages,
+				       unsigned long *pheadroom,
 				       unsigned long *pdirty,
 				       unsigned long *pwriteback)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1fedbde68f59..882c10cfd0ba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3740,44 +3740,43 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 /**
  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
  * @wb: bdi_writeback in question
- * @pavail: out parameter for number of available pages
+ * @pfilepages: out parameter for number of file pages
+ * @pheadroom: out parameter for number of allocatable pages according to memcg
  * @pdirty: out parameter for number of dirty pages
  * @pwriteback: out parameter for number of pages under writeback
  *
- * Determine the numbers of available, dirty, and writeback pages in @wb's
- * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
- * more involved.
+ * Determine the numbers of file, headroom, dirty, and writeback pages in
+ * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
+ * is a bit more involved.
  *
- * A memcg's headroom is "min(max, high) - used".  The available memory is
- * calculated as the lowest headroom of itself and the ancestors plus the
- * number of pages already being used for file pages.  Note that this
- * doesn't consider the actual amount of available memory in the system.
- * The caller should further cap *@pavail accordingly.
+ * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
+ * headroom is calculated as the lowest headroom of itself and the
+ * ancestors.  Note that this doesn't consider the actual amount of
+ * available memory in the system.  The caller should further cap
+ * *@pheadroom accordingly.
  */
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
-			 unsigned long *pdirty, unsigned long *pwriteback)
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+			 unsigned long *pheadroom, unsigned long *pdirty,
+			 unsigned long *pwriteback)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
-	unsigned long head_room = PAGE_COUNTER_MAX;
-	unsigned long file_pages;
 
 	*pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 
 	/* this should eventually include NR_UNSTABLE_NFS */
 	*pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+						     (1 << LRU_ACTIVE_FILE));
+	*pheadroom = PAGE_COUNTER_MAX;
 
-	file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
-						    (1 << LRU_ACTIVE_FILE));
 	while ((parent = parent_mem_cgroup(memcg))) {
 		unsigned long ceiling = min(memcg->memory.limit, memcg->high);
 		unsigned long used = page_counter_read(&memcg->memory);
 
-		head_room = min(head_room, ceiling - min(ceiling, used));
+		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
 		memcg = parent;
 	}
-
-	*pavail = file_pages + head_room;
 }
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 56c0bffa9f49..2c90357c34ea 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -684,13 +684,19 @@ static unsigned long hard_dirty_limit(struct wb_domain *dom,
 	return max(thresh, dom->dirty_limit);
 }
 
-/* memory available to a memcg domain is capped by system-wide clean memory */
-static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+/*
+ * Memory which can be further allocated to a memcg domain is capped by
+ * system-wide clean memory excluding the amount being used in the domain.
+ */
+static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
+			    unsigned long filepages, unsigned long headroom)
 {
 	struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
-	unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+	unsigned long clean = filepages - min(filepages, mdtc->dirty);
+	unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+	unsigned long other_clean = global_clean - min(global_clean, clean);
 
-	mdtc->avail = min(mdtc->avail, clean);
+	mdtc->avail = filepages + min(headroom, other_clean);
 }
 
 /**
@@ -1564,16 +1570,16 @@ static void balance_dirty_pages(struct address_space *mapping,
 		}
 
 		if (mdtc) {
-			unsigned long writeback;
+			unsigned long filepages, headroom, writeback;
 
 			/*
 			 * If @wb belongs to !root memcg, repeat the same
 			 * basic calculations for the memcg domain.
 			 */
-			mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
-					    &writeback);
-			mdtc_cap_avail(mdtc);
+			mem_cgroup_wb_stats(wb, &filepages, &headroom,
+					    &mdtc->dirty, &writeback);
 			mdtc->dirty += writeback;
+			mdtc_calc_avail(mdtc, filepages, headroom);
 
 			domain_dirty_limits(mdtc);
 
@@ -1895,10 +1901,11 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
 		return true;
 
 	if (mdtc) {
-		unsigned long writeback;
+		unsigned long filepages, headroom, writeback;
 
-		mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
-		mdtc_cap_avail(mdtc);
+		mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
+				    &writeback);
+		mdtc_calc_avail(mdtc, filepages, headroom);
 		domain_dirty_limits(mdtc);	/* ditto, ignore writeback */
 
 		if (mdtc->dirty > mdtc->bg_thresh)
-- 
cgit v1.2.3


From b02176f30cd30acccd3b633ab7d9aed8b5da52ff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 8 Sep 2015 12:20:22 -0400
Subject: block: don't release bdi while request_queue has live references

bdi's are initialized in two steps, bdi_init() and bdi_register(), but
destroyed in a single step by bdi_destroy() which, for a bdi embedded
in a request_queue, is called during blk_cleanup_queue() which makes
the queue invisible and starts the draining of remaining usages.

A request_queue's user can access the congestion state of the embedded
bdi as long as it holds a reference to the queue.  As such, it may
access the congested state of a queue which finished
blk_cleanup_queue() but hasn't reached blk_release_queue() yet.
Because the congested state was embedded in backing_dev_info which in
turn is embedded in request_queue, accessing the congested state after
bdi_destroy() was called was fine.  The bdi was destroyed but the
memory region for the congested state remained accessible till the
queue got released.

a13f35e87140 ("writeback: don't embed root bdi_writeback_congested in
bdi_writeback") changed the situation.  Now, the root congested state
which is expected to be pinned while request_queue remains accessible
is separately reference counted and the base ref is put during
bdi_destroy().  This means that the root congested state may go away
prematurely while the queue is between bdi_dstroy() and
blk_cleanup_queue(), which was detected by Andrey's KASAN tests.

The root cause of this problem is that bdi doesn't distinguish the two
steps of destruction, unregistration and release, and now the root
congested state actually requires a separate release step.  To fix the
issue, this patch separates out bdi_unregister() and bdi_exit() from
bdi_destroy().  bdi_unregister() is called from blk_cleanup_queue()
and bdi_exit() from blk_release_queue().  bdi_destroy() is now just a
simple wrapper calling the two steps back-to-back.

While at it, the prototype of bdi_destroy() is moved right below
bdi_setup_and_register() so that the counterpart operations are
located together.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: a13f35e87140 ("writeback: don't embed root bdi_writeback_congested in bdi_writeback")
Cc: stable@vger.kernel.org # v4.2+
Reported-and-tested-by: Andrey Konovalov <andreyknvl@google.com>
Link: http://lkml.kernel.org/g/CAAeHK+zUJ74Zn17=rOyxacHU18SgCfC6bsYW=6kCY5GXJBwGfQ@mail.gmail.com
Reviewed-by: Jan Kara <jack@suse.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c            |  2 +-
 block/blk-sysfs.c           |  1 +
 include/linux/backing-dev.h |  6 +++++-
 mm/backing-dev.c            | 12 +++++++++++-
 4 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 2eb722d48773..18e92a6645e2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -576,7 +576,7 @@ void blk_cleanup_queue(struct request_queue *q)
 		q->queue_lock = &q->__queue_lock;
 	spin_unlock_irq(lock);
 
-	bdi_destroy(&q->backing_dev_info);
+	bdi_unregister(&q->backing_dev_info);
 
 	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3e44a9da2a13..07b42f5ad797 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -540,6 +540,7 @@ static void blk_release_queue(struct kobject *kobj)
 	struct request_queue *q =
 		container_of(kobj, struct request_queue, kobj);
 
+	bdi_exit(&q->backing_dev_info);
 	blkcg_exit_queue(q);
 
 	if (q->elevator) {
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 78677e5a65bf..c85f74946a8b 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -19,13 +19,17 @@
 #include <linux/slab.h>
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
-void bdi_destroy(struct backing_dev_info *bdi);
+void bdi_exit(struct backing_dev_info *bdi);
 
 __printf(3, 4)
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
+void bdi_unregister(struct backing_dev_info *bdi);
+
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
+void bdi_destroy(struct backing_dev_info *bdi);
+
 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
 			bool range_cyclic, enum wb_reason reason);
 void wb_start_background_writeback(struct bdi_writeback *wb);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e92d77937fd3..9e841399041a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -835,7 +835,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
 	synchronize_rcu_expedited();
 }
 
-void bdi_destroy(struct backing_dev_info *bdi)
+void bdi_unregister(struct backing_dev_info *bdi)
 {
 	/* make sure nobody finds us on the bdi_list anymore */
 	bdi_remove_from_list(bdi);
@@ -847,9 +847,19 @@ void bdi_destroy(struct backing_dev_info *bdi)
 		device_unregister(bdi->dev);
 		bdi->dev = NULL;
 	}
+}
 
+void bdi_exit(struct backing_dev_info *bdi)
+{
+	WARN_ON_ONCE(bdi->dev);
 	wb_exit(&bdi->wb);
 }
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+	bdi_unregister(bdi);
+	bdi_exit(bdi);
+}
 EXPORT_SYMBOL(bdi_destroy);
 
 /*
-- 
cgit v1.2.3


From d976441f44bc5d48635d081d277aa76556ffbf8b Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Mon, 19 Oct 2015 11:37:17 +0300
Subject: compiler, atomics, kasan: Provide READ_ONCE_NOCHECK()

Some code may perform racy by design memory reads. This could be
harmless, yet such code may produce KASAN warnings.

To hide such accesses from KASAN this patch introduces
READ_ONCE_NOCHECK() macro. KASAN will not check the memory
accessed by READ_ONCE_NOCHECK(). The KernelThreadSanitizer
(KTSAN) is going to ignore it as well.

This patch creates __read_once_size_nocheck() a clone of
__read_once_size(). The only difference between them is
'no_sanitized_address' attribute appended to '*_nocheck'
function. This attribute tells the compiler that instrumentation
of memory accesses should not be applied to that function. We
declare it as static '__maybe_unsed' because GCC is not capable
to inline such function:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368

With KASAN=n READ_ONCE_NOCHECK() is just a clone of READ_ONCE().

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wolfram Gloger <wmglo@dent.med.uni-muenchen.de>
Cc: kasan-dev <kasan-dev@googlegroups.com>
Link: http://lkml.kernel.org/r/1445243838-17763-2-git-send-email-aryabinin@virtuozzo.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler-gcc.h | 13 +++++++++
 include/linux/compiler.h     | 66 +++++++++++++++++++++++++++++++++++---------
 2 files changed, 66 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index dfaa7b3e9ae9..8efb40e61d6e 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -237,12 +237,25 @@
 #define KASAN_ABI_VERSION 3
 #endif
 
+#if GCC_VERSION >= 40902
+/*
+ * Tell the compiler that address safety instrumentation (KASAN)
+ * should not be applied to that function.
+ * Conflicts with inlining: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
+ */
+#define __no_sanitize_address __attribute__((no_sanitize_address))
+#endif
+
 #endif	/* gcc version >= 40000 specific checks */
 
 #if !defined(__noclone)
 #define __noclone	/* not needed */
 #endif
 
+#if !defined(__no_sanitize_address)
+#define __no_sanitize_address
+#endif
+
 /*
  * A trick to suppress uninitialized variable warning without generating any
  * code
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index c836eb2dc44d..3d7810341b57 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -198,19 +198,45 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 
 #include <uapi/linux/types.h>
 
-static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
+#define __READ_ONCE_SIZE						\
+({									\
+	switch (size) {							\
+	case 1: *(__u8 *)res = *(volatile __u8 *)p; break;		\
+	case 2: *(__u16 *)res = *(volatile __u16 *)p; break;		\
+	case 4: *(__u32 *)res = *(volatile __u32 *)p; break;		\
+	case 8: *(__u64 *)res = *(volatile __u64 *)p; break;		\
+	default:							\
+		barrier();						\
+		__builtin_memcpy((void *)res, (const void *)p, size);	\
+		barrier();						\
+	}								\
+})
+
+static __always_inline
+void __read_once_size(const volatile void *p, void *res, int size)
 {
-	switch (size) {
-	case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
-	case 2: *(__u16 *)res = *(volatile __u16 *)p; break;
-	case 4: *(__u32 *)res = *(volatile __u32 *)p; break;
-	case 8: *(__u64 *)res = *(volatile __u64 *)p; break;
-	default:
-		barrier();
-		__builtin_memcpy((void *)res, (const void *)p, size);
-		barrier();
-	}
+	__READ_ONCE_SIZE;
+}
+
+#ifdef CONFIG_KASAN
+/*
+ * This function is not 'inline' because __no_sanitize_address confilcts
+ * with inlining. Attempt to inline it may cause a build failure.
+ * 	https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
+ * '__maybe_unused' allows us to avoid defined-but-not-used warnings.
+ */
+static __no_sanitize_address __maybe_unused
+void __read_once_size_nocheck(const volatile void *p, void *res, int size)
+{
+	__READ_ONCE_SIZE;
+}
+#else
+static __always_inline
+void __read_once_size_nocheck(const volatile void *p, void *res, int size)
+{
+	__READ_ONCE_SIZE;
 }
+#endif
 
 static __always_inline void __write_once_size(volatile void *p, void *res, int size)
 {
@@ -248,8 +274,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * required ordering.
  */
 
-#define READ_ONCE(x) \
-	({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
+#define __READ_ONCE(x, check)						\
+({									\
+	union { typeof(x) __val; char __c[1]; } __u;			\
+	if (check)							\
+		__read_once_size(&(x), __u.__c, sizeof(x));		\
+	else								\
+		__read_once_size_nocheck(&(x), __u.__c, sizeof(x));	\
+	__u.__val;							\
+})
+#define READ_ONCE(x) __READ_ONCE(x, 1)
+
+/*
+ * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need
+ * to hide memory access from KASAN.
+ */
+#define READ_ONCE_NOCHECK(x) __READ_ONCE(x, 0)
 
 #define WRITE_ONCE(x, val) \
 ({							\
-- 
cgit v1.2.3


From 67a2e213e7e937c41c52ab5bc46bf3f4de469f6e Mon Sep 17 00:00:00 2001
From: Rohit Vaswani <rvaswani@codeaurora.org>
Date: Thu, 22 Oct 2015 13:32:11 -0700
Subject: mm: cma: fix incorrect type conversion for size during dma allocation

This was found during userspace fuzzing test when a large size dma cma
allocation is made by driver(like ion) through userspace.

  show_stack+0x10/0x1c
  dump_stack+0x74/0xc8
  kasan_report_error+0x2b0/0x408
  kasan_report+0x34/0x40
  __asan_storeN+0x15c/0x168
  memset+0x20/0x44
  __dma_alloc_coherent+0x114/0x18c

Signed-off-by: Rohit Vaswani <rvaswani@codeaurora.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/dma-contiguous.c  | 2 +-
 include/linux/cma.h            | 2 +-
 include/linux/dma-contiguous.h | 4 ++--
 mm/cma.c                       | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
index 950fff9ce453..a12ff9863d7e 100644
--- a/drivers/base/dma-contiguous.c
+++ b/drivers/base/dma-contiguous.c
@@ -187,7 +187,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
  * global one. Requires architecture specific dev_get_cma_area() helper
  * function.
  */
-struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 				       unsigned int align)
 {
 	if (align > CONFIG_CMA_ALIGNMENT)
diff --git a/include/linux/cma.h b/include/linux/cma.h
index f7ef093ec49a..29f9e774ab76 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -26,6 +26,6 @@ extern int __init cma_declare_contiguous(phys_addr_t base,
 extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
 					unsigned int order_per_bit,
 					struct cma **res_cma);
-extern struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align);
+extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align);
 extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
 #endif
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index 569bbd039896..fec734df1524 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -111,7 +111,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size,
 	return ret;
 }
 
-struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 				       unsigned int order);
 bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 				 int count);
@@ -144,7 +144,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size,
 }
 
 static inline
-struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 				       unsigned int order)
 {
 	return NULL;
diff --git a/mm/cma.c b/mm/cma.c
index e7d1db533025..4eb56badf37e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -361,7 +361,7 @@ err:
  * This function allocates part of contiguous memory on specific
  * contiguous memory area.
  */
-struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
+struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
 {
 	unsigned long mask, offset, pfn, start = 0;
 	unsigned long bitmap_maxno, bitmap_no, bitmap_count;
@@ -371,7 +371,7 @@ struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
 	if (!cma || !cma->count)
 		return NULL;
 
-	pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
+	pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
 		 count, align);
 
 	if (!count)
-- 
cgit v1.2.3